From: "J. corwin Coburn" <corwin@redhat.com>
To: dm-devel@redhat.com, linux-block@vger.kernel.org
Cc: corwin <corwin@redhat.com>
Subject: [dm-devel] [PATCH 3/5] Add dm-vdo device mapper target implementation.
Date: Mon,  8 May 2023 21:05:43 -0400
Message-ID: <20230509010545.72448-4-corwin@redhat.com>
In-Reply-To: <20230509010545.72448-1-corwin@redhat.com>

From: corwin <corwin@redhat.com>

This adds the source files that implement the dm-vdo device mapper target, which provides block-level deduplication, compression, and thin provisioning.

Signed-off-by: corwin <corwin@redhat.com>
---
 drivers/md/dm-vdo/action-manager.c   |  410 ++
 drivers/md/dm-vdo/action-manager.h   |  117 +
 drivers/md/dm-vdo/admin-state.c      |  512 +++
 drivers/md/dm-vdo/admin-state.h      |  180 +
 drivers/md/dm-vdo/block-map.c        | 3388 +++++++++++++++++
 drivers/md/dm-vdo/block-map.h        |  391 ++
 drivers/md/dm-vdo/completion.c       |  141 +
 drivers/md/dm-vdo/completion.h       |  155 +
 drivers/md/dm-vdo/constants.c        |   15 +
 drivers/md/dm-vdo/constants.h        |  102 +
 drivers/md/dm-vdo/data-vio.c         | 2070 ++++++++++
 drivers/md/dm-vdo/data-vio.h         |  689 ++++
 drivers/md/dm-vdo/dedupe.c           | 3073 +++++++++++++++
 drivers/md/dm-vdo/dedupe.h           |  120 +
 drivers/md/dm-vdo/dump.c             |  288 ++
 drivers/md/dm-vdo/dump.h             |   17 +
 drivers/md/dm-vdo/encodings.c        | 1523 ++++++++
 drivers/md/dm-vdo/encodings.h        | 1307 +++++++
 drivers/md/dm-vdo/flush.c            |  563 +++
 drivers/md/dm-vdo/flush.h            |   44 +
 drivers/md/dm-vdo/int-map.c          |  710 ++++
 drivers/md/dm-vdo/int-map.h          |   40 +
 drivers/md/dm-vdo/io-submitter.c     |  483 +++
 drivers/md/dm-vdo/io-submitter.h     |   52 +
 drivers/md/dm-vdo/logical-zone.c     |  378 ++
 drivers/md/dm-vdo/logical-zone.h     |   87 +
 drivers/md/dm-vdo/message-stats.c    | 1222 ++++++
 drivers/md/dm-vdo/message-stats.h    |   13 +
 drivers/md/dm-vdo/packer.c           |  794 ++++
 drivers/md/dm-vdo/packer.h           |  123 +
 drivers/md/dm-vdo/physical-zone.c    |  650 ++++
 drivers/md/dm-vdo/physical-zone.h    |  115 +
 drivers/md/dm-vdo/pointer-map.c      |  691 ++++
 drivers/md/dm-vdo/pointer-map.h      |   81 +
 drivers/md/dm-vdo/pool-sysfs-stats.c | 2063 ++++++++++
 drivers/md/dm-vdo/pool-sysfs.c       |  193 +
 drivers/md/dm-vdo/pool-sysfs.h       |   19 +
 drivers/md/dm-vdo/priority-table.c   |  226 ++
 drivers/md/dm-vdo/priority-table.h   |   48 +
 drivers/md/dm-vdo/recovery-journal.c | 1772 +++++++++
 drivers/md/dm-vdo/recovery-journal.h |  313 ++
 drivers/md/dm-vdo/release-versions.h |   20 +
 drivers/md/dm-vdo/repair.c           | 1775 +++++++++
 drivers/md/dm-vdo/repair.h           |   14 +
 drivers/md/dm-vdo/slab-depot.c       | 5212 ++++++++++++++++++++++++++
 drivers/md/dm-vdo/slab-depot.h       |  594 +++
 drivers/md/dm-vdo/statistics.h       |  279 ++
 drivers/md/dm-vdo/status-codes.c     |  127 +
 drivers/md/dm-vdo/status-codes.h     |  112 +
 drivers/md/dm-vdo/sysfs.c            |   84 +
 drivers/md/dm-vdo/types.h            |  403 ++
 drivers/md/dm-vdo/vdo.c              | 1846 +++++++++
 drivers/md/dm-vdo/vdo.h              |  381 ++
 drivers/md/dm-vdo/vio.c              |  525 +++
 drivers/md/dm-vdo/vio.h              |  221 ++
 drivers/md/dm-vdo/wait-queue.c       |  223 ++
 drivers/md/dm-vdo/wait-queue.h       |  129 +
 drivers/md/dm-vdo/work-queue.c       |  658 ++++
 drivers/md/dm-vdo/work-queue.h       |   53 +
 59 files changed, 37834 insertions(+)
 create mode 100644 drivers/md/dm-vdo/action-manager.c
 create mode 100644 drivers/md/dm-vdo/action-manager.h
 create mode 100644 drivers/md/dm-vdo/admin-state.c
 create mode 100644 drivers/md/dm-vdo/admin-state.h
 create mode 100644 drivers/md/dm-vdo/block-map.c
 create mode 100644 drivers/md/dm-vdo/block-map.h
 create mode 100644 drivers/md/dm-vdo/completion.c
 create mode 100644 drivers/md/dm-vdo/completion.h
 create mode 100644 drivers/md/dm-vdo/constants.c
 create mode 100644 drivers/md/dm-vdo/constants.h
 create mode 100644 drivers/md/dm-vdo/data-vio.c
 create mode 100644 drivers/md/dm-vdo/data-vio.h
 create mode 100644 drivers/md/dm-vdo/dedupe.c
 create mode 100644 drivers/md/dm-vdo/dedupe.h
 create mode 100644 drivers/md/dm-vdo/dump.c
 create mode 100644 drivers/md/dm-vdo/dump.h
 create mode 100644 drivers/md/dm-vdo/encodings.c
 create mode 100644 drivers/md/dm-vdo/encodings.h
 create mode 100644 drivers/md/dm-vdo/flush.c
 create mode 100644 drivers/md/dm-vdo/flush.h
 create mode 100644 drivers/md/dm-vdo/int-map.c
 create mode 100644 drivers/md/dm-vdo/int-map.h
 create mode 100644 drivers/md/dm-vdo/io-submitter.c
 create mode 100644 drivers/md/dm-vdo/io-submitter.h
 create mode 100644 drivers/md/dm-vdo/logical-zone.c
 create mode 100644 drivers/md/dm-vdo/logical-zone.h
 create mode 100644 drivers/md/dm-vdo/message-stats.c
 create mode 100644 drivers/md/dm-vdo/message-stats.h
 create mode 100644 drivers/md/dm-vdo/packer.c
 create mode 100644 drivers/md/dm-vdo/packer.h
 create mode 100644 drivers/md/dm-vdo/physical-zone.c
 create mode 100644 drivers/md/dm-vdo/physical-zone.h
 create mode 100644 drivers/md/dm-vdo/pointer-map.c
 create mode 100644 drivers/md/dm-vdo/pointer-map.h
 create mode 100644 drivers/md/dm-vdo/pool-sysfs-stats.c
 create mode 100644 drivers/md/dm-vdo/pool-sysfs.c
 create mode 100644 drivers/md/dm-vdo/pool-sysfs.h
 create mode 100644 drivers/md/dm-vdo/priority-table.c
 create mode 100644 drivers/md/dm-vdo/priority-table.h
 create mode 100644 drivers/md/dm-vdo/recovery-journal.c
 create mode 100644 drivers/md/dm-vdo/recovery-journal.h
 create mode 100644 drivers/md/dm-vdo/release-versions.h
 create mode 100644 drivers/md/dm-vdo/repair.c
 create mode 100644 drivers/md/dm-vdo/repair.h
 create mode 100644 drivers/md/dm-vdo/slab-depot.c
 create mode 100644 drivers/md/dm-vdo/slab-depot.h
 create mode 100644 drivers/md/dm-vdo/statistics.h
 create mode 100644 drivers/md/dm-vdo/status-codes.c
 create mode 100644 drivers/md/dm-vdo/status-codes.h
 create mode 100644 drivers/md/dm-vdo/sysfs.c
 create mode 100644 drivers/md/dm-vdo/types.h
 create mode 100644 drivers/md/dm-vdo/vdo.c
 create mode 100644 drivers/md/dm-vdo/vdo.h
 create mode 100644 drivers/md/dm-vdo/vio.c
 create mode 100644 drivers/md/dm-vdo/vio.h
 create mode 100644 drivers/md/dm-vdo/wait-queue.c
 create mode 100644 drivers/md/dm-vdo/wait-queue.h
 create mode 100644 drivers/md/dm-vdo/work-queue.c
 create mode 100644 drivers/md/dm-vdo/work-queue.h

diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c
new file mode 100644
index 00000000000..76880677aaf
--- /dev/null
+++ b/drivers/md/dm-vdo/action-manager.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "action-manager.h"
+
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+
+/**
+ * struct action - An action to be performed in each of a set of zones.
+ * @in_use: Whether this structure is in use.
+ * @operation: The admin operation associated with this action.
+ * @preamble: The method to run on the initiator thread before the action is applied to each zone.
+ * @zone_action: The action to be performed in each zone.
+ * @conclusion: The method to run on the initiator thread after the action is applied to each zone.
+ * @parent: The object to notify when the action is complete.
+ * @context: The action specific context.
+ * @next: The action to perform after this one.
+ */
+struct action {
+	bool in_use;
+	const struct admin_state_code *operation;
+	vdo_action_preamble *preamble;
+	vdo_zone_action *zone_action;
+	vdo_action_conclusion *conclusion;
+	struct vdo_completion *parent;
+	void *context;
+	struct action *next;
+};
+
+/**
+ * struct action_manager - Definition of an action manager.
+ * @completion: The completion for performing actions.
+ * @state: The state of this action manager.
+ * @actions: The two action slots.
+ * @current_action: The current action slot.
+ * @zones: The number of zones in which an action is to be applied.
+ * @scheduler: A function to schedule a default next action.
+ * @get_zone_thread_id: A function to get the id of the thread on which to apply an action to a
+ *                      zone.
+ * @initiator_thread_id: The ID of the thread on which actions may be initiated.
+ * @context: Opaque data associated with this action manager.
+ * @acting_zone: The zone currently being acted upon.
+ */
+struct action_manager {
+	struct vdo_completion completion;
+	struct admin_state state;
+	struct action actions[2];
+	struct action *current_action;
+	zone_count_t zones;
+	vdo_action_scheduler *scheduler;
+	vdo_zone_thread_getter *get_zone_thread_id;
+	thread_id_t initiator_thread_id;
+	void *context;
+	zone_count_t acting_zone;
+};
+
+static inline struct action_manager *as_action_manager(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_ACTION_COMPLETION);
+	return container_of(completion, struct action_manager, completion);
+}
+
+/* Implements vdo_action_scheduler. */
+static bool no_default_action(void *context __always_unused)
+{
+	return false;
+}
+
+/* Implements vdo_action_preamble. */
+static void no_preamble(void *context __always_unused,
+			struct vdo_completion *completion)
+{
+	vdo_finish_completion(completion);
+}
+
+/* Implements vdo_action_conclusion. */
+static int no_conclusion(void *context __always_unused)
+{
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_make_action_manager() - Make an action manager.
+ * @zones: The number of zones to which actions will be applied.
+ * @get_zone_thread_id: A function to get the thread id associated with a zone.
+ * @initiator_thread_id: The ID of the thread on which actions may be initiated.
+ * @context: The object which holds the per-zone context for the action.
+ * @scheduler: A function to schedule a next action after an action concludes if there is no
+ *             pending action (may be NULL).
+ * @vdo: The vdo used to initialize completions.
+ * @manager_ptr: A pointer to hold the new action manager.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_action_manager(zone_count_t zones,
+			    vdo_zone_thread_getter *get_zone_thread_id,
+			    thread_id_t initiator_thread_id,
+			    void *context,
+			    vdo_action_scheduler *scheduler,
+			    struct vdo *vdo,
+			    struct action_manager **manager_ptr)
+{
+	struct action_manager *manager;
+	int result = UDS_ALLOCATE(1, struct action_manager, __func__, &manager);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*manager = (struct action_manager) {
+		.zones = zones,
+		.scheduler =
+			((scheduler == NULL) ? no_default_action : scheduler),
+		.get_zone_thread_id = get_zone_thread_id,
+		.initiator_thread_id = initiator_thread_id,
+		.context = context,
+	};
+
+	manager->actions[0].next = &manager->actions[1];
+	manager->current_action = manager->actions[1].next =
+		&manager->actions[0];
+	vdo_set_admin_state_code(&manager->state,
+				 VDO_ADMIN_STATE_NORMAL_OPERATION);
+	vdo_initialize_completion(&manager->completion, vdo,
+				  VDO_ACTION_COMPLETION);
+	*manager_ptr = manager;
+	return VDO_SUCCESS;
+}
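
For illustration only, here is a minimal sketch of how a caller might wire up an action manager for a set of zones. The struct my_zones type, its fields, and get_zone_thread() are hypothetical names invented for this example; only the vdo_make_action_manager() call itself is part of this patch.

/* Hypothetical per-zone bookkeeping owned by the caller. */
struct my_zones {
	zone_count_t zone_count;
	thread_id_t *thread_ids;	/* one thread ID per zone */
	struct action_manager *manager;
};

/* Implements vdo_zone_thread_getter. */
static thread_id_t get_zone_thread(void *context, zone_count_t zone_number)
{
	struct my_zones *zones = context;

	return zones->thread_ids[zone_number];
}

static int make_zone_action_manager(struct my_zones *zones,
				    thread_id_t admin_thread_id,
				    struct vdo *vdo)
{
	/* Passing a NULL scheduler means no default action is ever scheduled. */
	return vdo_make_action_manager(zones->zone_count, get_zone_thread,
				       admin_thread_id, zones, NULL, vdo,
				       &zones->manager);
}
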
+
+const struct admin_state_code *vdo_get_current_manager_operation(struct action_manager *manager)
+{
+	return vdo_get_admin_state_code(&manager->state);
+}
+
+void *vdo_get_current_action_context(struct action_manager *manager)
+{
+	return manager->current_action->in_use ? manager->current_action->context : NULL;
+}
+
+static void finish_action_callback(struct vdo_completion *completion);
+static void apply_to_zone(struct vdo_completion *completion);
+
+static thread_id_t get_acting_zone_thread_id(struct action_manager *manager)
+{
+	return manager->get_zone_thread_id(manager->context, manager->acting_zone);
+}
+
+static void preserve_error(struct vdo_completion *completion)
+{
+	if (completion->parent != NULL)
+		vdo_set_completion_result(completion->parent, completion->result);
+
+	vdo_reset_completion(completion);
+	vdo_run_completion(completion);
+}
+
+static void prepare_for_next_zone(struct action_manager *manager)
+{
+	vdo_prepare_completion_for_requeue(&manager->completion,
+					   apply_to_zone,
+					   preserve_error,
+					   get_acting_zone_thread_id(manager),
+					   manager->current_action->parent);
+}
+
+static void prepare_for_conclusion(struct action_manager *manager)
+{
+	vdo_prepare_completion_for_requeue(&manager->completion,
+					   finish_action_callback,
+					   preserve_error,
+					   manager->initiator_thread_id,
+					   manager->current_action->parent);
+}
+
+static void apply_to_zone(struct vdo_completion *completion)
+{
+	zone_count_t zone;
+	struct action_manager *manager = as_action_manager(completion);
+
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == get_acting_zone_thread_id(manager)),
+			"%s() called on acting zone's thread",
+			__func__);
+
+	zone = manager->acting_zone++;
+	if (manager->acting_zone == manager->zones)
+		/*
+		 * We are about to apply to the last zone. Once that is finished, we're done, so go
+		 * back to the initiator thread and finish up.
+		 */
+		prepare_for_conclusion(manager);
+	else
+		/* Prepare to come back on the next zone */
+		prepare_for_next_zone(manager);
+
+	manager->current_action->zone_action(manager->context, zone, completion);
+}
+
+static void handle_preamble_error(struct vdo_completion *completion)
+{
+	/* Skip the zone actions since the preamble failed. */
+	completion->callback = finish_action_callback;
+	preserve_error(completion);
+}
+
+static void launch_current_action(struct action_manager *manager)
+{
+	struct action *action = manager->current_action;
+	int result = vdo_start_operation(&manager->state, action->operation);
+
+	if (result != VDO_SUCCESS) {
+		if (action->parent != NULL)
+			vdo_set_completion_result(action->parent, result);
+
+		/* We aren't going to run the preamble, so don't run the conclusion */
+		action->conclusion = no_conclusion;
+		finish_action_callback(&manager->completion);
+		return;
+	}
+
+	if (action->zone_action == NULL) {
+		prepare_for_conclusion(manager);
+	} else {
+		manager->acting_zone = 0;
+		vdo_prepare_completion_for_requeue(&manager->completion,
+						   apply_to_zone,
+						   handle_preamble_error,
+						   get_acting_zone_thread_id(manager),
+						   manager->current_action->parent);
+	}
+
+	action->preamble(manager->context, &manager->completion);
+}
+
+/**
+ * vdo_schedule_default_action() - Attempt to schedule the default action.
+ * @manager: The action manager.
+ *
+ * If the manager is not operating normally, the action will not be scheduled.
+ *
+ * Return: true if an action was scheduled.
+ */
+bool vdo_schedule_default_action(struct action_manager *manager)
+{
+	/* Don't schedule a default action if we are operating or not in normal operation. */
+	const struct admin_state_code *code = vdo_get_current_manager_operation(manager);
+
+	return ((code == VDO_ADMIN_STATE_NORMAL_OPERATION) &&
+		manager->scheduler(manager->context));
+}
+
+static void finish_action_callback(struct vdo_completion *completion)
+{
+	bool has_next_action;
+	int result;
+	struct action_manager *manager = as_action_manager(completion);
+	struct action action = *(manager->current_action);
+
+	manager->current_action->in_use = false;
+	manager->current_action = manager->current_action->next;
+
+	/*
+	 * We need to check this now to avoid use-after-free issues if running the conclusion or
+	 * notifying the parent results in the manager being freed.
+	 */
+	has_next_action =
+		(manager->current_action->in_use || vdo_schedule_default_action(manager));
+	result = action.conclusion(manager->context);
+	vdo_finish_operation(&manager->state, VDO_SUCCESS);
+	if (action.parent != NULL)
+		vdo_continue_completion(action.parent, result);
+
+	if (has_next_action)
+		launch_current_action(manager);
+}
+
+/**
+ * vdo_schedule_action() - Schedule an action to be applied to all zones.
+ * @manager: The action manager to schedule the action on.
+ * @preamble: A method to be invoked on the initiator thread once this action is started but before
+ *            applying to each zone; may be NULL.
+ * @action: The action to apply to each zone; may be NULL.
+ * @conclusion: A method to be invoked back on the initiator thread once the action has been
+ *              applied to all zones; may be NULL.
+ * @parent: The object to notify once the action is complete or if the action can not be scheduled;
+ *          may be NULL.
+ *
+ * The action will be launched immediately if there is no current action, or as soon as the current
+ * action completes. If there is already a pending action, this action will not be scheduled, and,
+ * if it has a parent, that parent will be notified. At least one of the preamble, action, or
+ * conclusion must not be NULL.
+ *
+ * Return: true if the action was scheduled.
+ */
+bool vdo_schedule_action(struct action_manager *manager,
+			 vdo_action_preamble *preamble,
+			 vdo_zone_action *action,
+			 vdo_action_conclusion *conclusion,
+			 struct vdo_completion *parent)
+{
+	return vdo_schedule_operation(manager,
+				      VDO_ADMIN_STATE_OPERATING,
+				      preamble,
+				      action,
+				      conclusion,
+				      parent);
+}
+
+/**
+ * vdo_schedule_operation() - Schedule an operation to be applied to all zones.
+ * @manager: The action manager to schedule the action on.
+ * @operation: The operation this action will perform
+ * @preamble: A method to be invoked on the initiator thread once this action is started but before
+ *            applying to each zone; may be NULL.
+ * @action: The action to apply to each zone; may be NULL.
+ * @conclusion: A method to be invoked back on the initiator thread once the action has been
+ *              applied to all zones; may be NULL.
+ * @parent: The object to notify once the action is complete or if the action can not be scheduled;
+ *          may be NULL.
+ *
+ * The operation's action will be launched immediately if there is no current action, or as soon as
+ * the current action completes. If there is already a pending action, this operation will not be
+ * scheduled, and, if it has a parent, that parent will be notified. At least one of the preamble,
+ * action, or conclusion must not be NULL.
+ *
+ * Return: true if the action was scheduled.
+ */
+bool vdo_schedule_operation(struct action_manager *manager,
+			    const struct admin_state_code *operation,
+			    vdo_action_preamble *preamble,
+			    vdo_zone_action *action,
+			    vdo_action_conclusion *conclusion,
+			    struct vdo_completion *parent)
+{
+	return vdo_schedule_operation_with_context(manager,
+						   operation,
+						   preamble,
+						   action,
+						   conclusion,
+						   NULL,
+						   parent);
+}
+
+/**
+ * vdo_schedule_operation_with_context() - Schedule an operation on all zones.
+ * @manager: The action manager to schedule the action on.
+ * @operation: The operation this action will perform.
+ * @preamble: A method to be invoked on the initiator thread once this action is started but before
+ *            applying to each zone; may be NULL.
+ * @action: The action to apply to each zone; may be NULL.
+ * @conclusion: A method to be invoked back on the initiator thread once the action has been
+ *              applied to all zones; may be NULL.
+ * @context: An action-specific context which may be retrieved via
+ *           vdo_get_current_action_context(); may be NULL.
+ * @parent: The object to notify once the action is complete or if the action can not be scheduled;
+ *          may be NULL.
+ *
+ * The operation's action will be launched immediately if there is no current action, or as soon as
+ * the current action completes. If there is already a pending action, this operation will not be
+ * scheduled, and, if it has a parent, that parent will be notified. At least one of the preamble,
+ * action, or conclusion must not be NULL.
+ *
+ * Return: true if the action was scheduled.
+ */
+bool vdo_schedule_operation_with_context(struct action_manager *manager,
+					 const struct admin_state_code *operation,
+					 vdo_action_preamble *preamble,
+					 vdo_zone_action *action,
+					 vdo_action_conclusion *conclusion,
+					 void *context,
+					 struct vdo_completion *parent)
+{
+	struct action *current_action;
+
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == manager->initiator_thread_id),
+			"action initiated from correct thread");
+	if (!manager->current_action->in_use) {
+		current_action = manager->current_action;
+	} else if (!manager->current_action->next->in_use) {
+		current_action = manager->current_action->next;
+	} else {
+		if (parent != NULL)
+			vdo_continue_completion(parent, VDO_COMPONENT_BUSY);
+
+		return false;
+	}
+
+	*current_action = (struct action) {
+		.in_use = true,
+		.operation = operation,
+		.preamble = (preamble == NULL) ? no_preamble : preamble,
+		.zone_action = action,
+		.conclusion = (conclusion == NULL) ? no_conclusion : conclusion,
+		.context = context,
+		.parent = parent,
+		.next = current_action->next,
+	};
+
+	if (current_action == manager->current_action)
+		launch_current_action(manager);
+
+	return true;
+}
diff --git a/drivers/md/dm-vdo/action-manager.h b/drivers/md/dm-vdo/action-manager.h
new file mode 100644
index 00000000000..a04c6c0d43b
--- /dev/null
+++ b/drivers/md/dm-vdo/action-manager.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_ACTION_MANAGER_H
+#define VDO_ACTION_MANAGER_H
+
+#include "admin-state.h"
+#include "types.h"
+
+/*
+ * An action_manager provides a generic mechanism for applying actions to multi-zone entities (such
+ * as the block map or slab depot). Each action manager is tied to a specific context for which it
+ * manages actions. The manager ensures that only one action is active on that context at a time,
+ * and supports at most one pending action. Calls to schedule an action when there is already a
+ * pending action will result in VDO_COMPONENT_BUSY errors. Actions may only be submitted to the
+ * action manager from a single thread (which thread is determined when the action manager is
+ * constructed).
+ *
+ * A scheduled action consists of four components:
+ *
+ *   preamble
+ *      an optional method to be run on the initiator thread before applying the action to all zones
+ *   zone_action
+ *      an optional method to be applied to each of the zones
+ *   conclusion
+ *      an optional method to be run on the initiator thread once the per-zone method has been
+ *      applied to all zones
+ *   parent
+ *      an optional completion to be finished once the conclusion is done
+ *
+ * At least one of the three methods must be provided.
+ */
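
To make the shape of an action concrete, here is a minimal sketch of scheduling a suspend-style drain across all zones. The drain_zone(), finish_drain(), and suspend_all_zones() names are hypothetical and invented for this example; VDO_ADMIN_STATE_SUSPENDING is simply one admin state code with the operating and draining flags set.

/* Implements vdo_zone_action; runs on each zone's thread in turn. */
static void drain_zone(void *context, zone_count_t zone_number,
		       struct vdo_completion *parent)
{
	/*
	 * Quiesce the zone identified by zone_number here, then let the
	 * manager move on to the next zone (or to the conclusion).
	 */
	vdo_finish_completion(parent);
}

/* Implements vdo_action_conclusion; runs back on the initiator thread. */
static int finish_drain(void *context)
{
	return VDO_SUCCESS;
}

static bool suspend_all_zones(struct action_manager *manager,
			      struct vdo_completion *parent)
{
	/* A NULL preamble defaults to a no-op on the initiator thread. */
	return vdo_schedule_operation(manager, VDO_ADMIN_STATE_SUSPENDING,
				      NULL, drain_zone, finish_drain, parent);
}
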
+
+/*
+ * A function which is to be applied asynchronously to a set of zones.
+ * @context: The object which holds the per-zone context for the action
+ * @zone_number: The number of the zone to which the action is being applied
+ * @parent: The object to notify when the action is complete
+ */
+typedef void vdo_zone_action(void *context,
+			     zone_count_t zone_number,
+			     struct vdo_completion *parent);
+
+/*
+ * A function which is to be applied asynchronously on an action manager's initiator thread as the
+ * preamble of an action.
+ * @context: The object which holds the per-zone context for the action
+ * @parent: The object to notify when the action is complete
+ */
+typedef void vdo_action_preamble(void *context, struct vdo_completion *parent);
+
+/*
+ * A function which will run on the action manager's initiator thread as the conclusion of an
+ * action.
+ * @context: The object which holds the per-zone context for the action
+ *
+ * Return: VDO_SUCCESS or an error
+ */
+typedef int vdo_action_conclusion(void *context);
+
+/*
+ * A function to schedule an action.
+ * @context: The object which holds the per-zone context for the action
+ *
+ * Return: true if an action was scheduled
+ */
+typedef bool vdo_action_scheduler(void *context);
+
+/*
+ * A function to get the id of the thread associated with a given zone.
+ * @context: The action context
+ * @zone_number: The number of the zone for which the thread ID is desired
+ */
+typedef thread_id_t vdo_zone_thread_getter(void *context, zone_count_t zone_number);
+
+struct action_manager;
+
+int __must_check
+vdo_make_action_manager(zone_count_t zones,
+			vdo_zone_thread_getter *get_zone_thread_id,
+			thread_id_t initiator_thread_id,
+			void *context,
+			vdo_action_scheduler *scheduler,
+			struct vdo *vdo,
+			struct action_manager **manager_ptr);
+
+const struct admin_state_code *__must_check
+vdo_get_current_manager_operation(struct action_manager *manager);
+
+void * __must_check vdo_get_current_action_context(struct action_manager *manager);
+
+bool vdo_schedule_default_action(struct action_manager *manager);
+
+bool vdo_schedule_action(struct action_manager *manager,
+			 vdo_action_preamble *preamble,
+			 vdo_zone_action *action,
+			 vdo_action_conclusion *conclusion,
+			 struct vdo_completion *parent);
+
+bool vdo_schedule_operation(struct action_manager *manager,
+			    const struct admin_state_code *operation,
+			    vdo_action_preamble *preamble,
+			    vdo_zone_action *action,
+			    vdo_action_conclusion *conclusion,
+			    struct vdo_completion *parent);
+
+bool vdo_schedule_operation_with_context(struct action_manager *manager,
+					 const struct admin_state_code *operation,
+					 vdo_action_preamble *preamble,
+					 vdo_zone_action *action,
+					 vdo_action_conclusion *conclusion,
+					 void *context,
+					 struct vdo_completion *parent);
+
+#endif /* VDO_ACTION_MANAGER_H */
diff --git a/drivers/md/dm-vdo/admin-state.c b/drivers/md/dm-vdo/admin-state.c
new file mode 100644
index 00000000000..87cb0e28369
--- /dev/null
+++ b/drivers/md/dm-vdo/admin-state.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "admin-state.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "completion.h"
+#include "types.h"
+
+static const struct admin_state_code VDO_CODE_NORMAL_OPERATION = {
+	.name = "VDO_ADMIN_STATE_NORMAL_OPERATION",
+	.normal = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_NORMAL_OPERATION = &VDO_CODE_NORMAL_OPERATION;
+static const struct admin_state_code VDO_CODE_OPERATING = {
+	.name = "VDO_ADMIN_STATE_OPERATING",
+	.normal = true,
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_OPERATING = &VDO_CODE_OPERATING;
+static const struct admin_state_code VDO_CODE_FORMATTING = {
+	.name = "VDO_ADMIN_STATE_FORMATTING",
+	.operating = true,
+	.loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_FORMATTING = &VDO_CODE_FORMATTING;
+static const struct admin_state_code VDO_CODE_PRE_LOADING = {
+	.name = "VDO_ADMIN_STATE_PRE_LOADING",
+	.operating = true,
+	.loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADING = &VDO_CODE_PRE_LOADING;
+static const struct admin_state_code VDO_CODE_PRE_LOADED = {
+	.name = "VDO_ADMIN_STATE_PRE_LOADED",
+};
+const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADED = &VDO_CODE_PRE_LOADED;
+static const struct admin_state_code VDO_CODE_LOADING = {
+	.name = "VDO_ADMIN_STATE_LOADING",
+	.normal = true,
+	.operating = true,
+	.loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_LOADING = &VDO_CODE_LOADING;
+static const struct admin_state_code VDO_CODE_LOADING_FOR_RECOVERY = {
+	.name = "VDO_ADMIN_STATE_LOADING_FOR_RECOVERY",
+	.operating = true,
+	.loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_RECOVERY =
+	&VDO_CODE_LOADING_FOR_RECOVERY;
+static const struct admin_state_code VDO_CODE_LOADING_FOR_REBUILD = {
+	.name = "VDO_ADMIN_STATE_LOADING_FOR_REBUILD",
+	.operating = true,
+	.loading = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_REBUILD = &VDO_CODE_LOADING_FOR_REBUILD;
+static const struct admin_state_code VDO_CODE_WAITING_FOR_RECOVERY = {
+	.name = "VDO_ADMIN_STATE_WAITING_FOR_RECOVERY",
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY =
+	&VDO_CODE_WAITING_FOR_RECOVERY;
+static const struct admin_state_code VDO_CODE_NEW = {
+	.name = "VDO_ADMIN_STATE_NEW",
+	.quiescent = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_NEW = &VDO_CODE_NEW;
+static const struct admin_state_code VDO_CODE_INITIALIZED = {
+	.name = "VDO_ADMIN_STATE_INITIALIZED",
+};
+const struct admin_state_code *VDO_ADMIN_STATE_INITIALIZED = &VDO_CODE_INITIALIZED;
+static const struct admin_state_code VDO_CODE_RECOVERING = {
+	.name = "VDO_ADMIN_STATE_RECOVERING",
+	.draining = true,
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_RECOVERING = &VDO_CODE_RECOVERING;
+static const struct admin_state_code VDO_CODE_REBUILDING = {
+	.name = "VDO_ADMIN_STATE_REBUILDING",
+	.draining = true,
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_REBUILDING = &VDO_CODE_REBUILDING;
+static const struct admin_state_code VDO_CODE_SAVING = {
+	.name = "VDO_ADMIN_STATE_SAVING",
+	.draining = true,
+	.quiescing = true,
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SAVING = &VDO_CODE_SAVING;
+static const struct admin_state_code VDO_CODE_SAVED = {
+	.name = "VDO_ADMIN_STATE_SAVED",
+	.quiescent = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SAVED = &VDO_CODE_SAVED;
+static const struct admin_state_code VDO_CODE_SCRUBBING = {
+	.name = "VDO_ADMIN_STATE_SCRUBBING",
+	.draining = true,
+	.loading = true,
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SCRUBBING = &VDO_CODE_SCRUBBING;
+static const struct admin_state_code VDO_CODE_SAVE_FOR_SCRUBBING = {
+	.name = "VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING",
+	.draining = true,
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING = &VDO_CODE_SAVE_FOR_SCRUBBING;
+static const struct admin_state_code VDO_CODE_STOPPING = {
+	.name = "VDO_ADMIN_STATE_STOPPING",
+	.draining = true,
+	.quiescing = true,
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_STOPPING = &VDO_CODE_STOPPING;
+static const struct admin_state_code VDO_CODE_STOPPED = {
+	.name = "VDO_ADMIN_STATE_STOPPED",
+	.quiescent = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_STOPPED = &VDO_CODE_STOPPED;
+static const struct admin_state_code VDO_CODE_SUSPENDING = {
+	.name = "VDO_ADMIN_STATE_SUSPENDING",
+	.draining = true,
+	.quiescing = true,
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDING = &VDO_CODE_SUSPENDING;
+static const struct admin_state_code VDO_CODE_SUSPENDED = {
+	.name = "VDO_ADMIN_STATE_SUSPENDED",
+	.quiescent = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED = &VDO_CODE_SUSPENDED;
+static const struct admin_state_code VDO_CODE_SUSPENDED_OPERATION = {
+	.name = "VDO_ADMIN_STATE_SUSPENDED_OPERATION",
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED_OPERATION = &VDO_CODE_SUSPENDED_OPERATION;
+static const struct admin_state_code VDO_CODE_RESUMING = {
+	.name = "VDO_ADMIN_STATE_RESUMING",
+	.operating = true,
+};
+const struct admin_state_code *VDO_ADMIN_STATE_RESUMING = &VDO_CODE_RESUMING;
+
+/**
+ * get_next_state() - Determine the state which should be set after a given operation completes
+ *                    based on the operation and the current state.
+ * @operation: The operation to be started.
+ *
+ * Return: The state to set when the operation completes or NULL if the operation can not be
+ *         started in the current state.
+ */
+static const struct admin_state_code *
+get_next_state(const struct admin_state *state, const struct admin_state_code *operation)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	if (code->operating)
+		return NULL;
+
+	if (operation == VDO_ADMIN_STATE_SAVING)
+		return (code == VDO_ADMIN_STATE_NORMAL_OPERATION ? VDO_ADMIN_STATE_SAVED : NULL);
+
+	if (operation == VDO_ADMIN_STATE_SUSPENDING)
+		return (code == VDO_ADMIN_STATE_NORMAL_OPERATION
+			? VDO_ADMIN_STATE_SUSPENDED
+			: NULL);
+
+	if (operation == VDO_ADMIN_STATE_STOPPING)
+		return (code == VDO_ADMIN_STATE_NORMAL_OPERATION ? VDO_ADMIN_STATE_STOPPED : NULL);
+
+	if (operation == VDO_ADMIN_STATE_PRE_LOADING)
+		return (code == VDO_ADMIN_STATE_INITIALIZED ? VDO_ADMIN_STATE_PRE_LOADED : NULL);
+
+	if (operation == VDO_ADMIN_STATE_SUSPENDED_OPERATION)
+		return (((code == VDO_ADMIN_STATE_SUSPENDED) ||
+			 (code == VDO_ADMIN_STATE_SAVED)) ? code : NULL);
+
+	return VDO_ADMIN_STATE_NORMAL_OPERATION;
+}
+
+/**
+ * vdo_finish_operation() - Finish the current operation.
+ *
+ * Will notify the operation waiter if there is one. This method should be used for operations
+ * started with vdo_start_operation(). For operations which were started with vdo_start_draining(),
+ * use vdo_finish_draining() instead.
+ *
+ * Return: true if there was an operation to finish.
+ */
+bool vdo_finish_operation(struct admin_state *state, int result)
+{
+	if (!vdo_get_admin_state_code(state)->operating)
+		return false;
+
+	state->complete = state->starting;
+	if (state->waiter != NULL)
+		vdo_set_completion_result(state->waiter, result);
+
+	if (!state->starting) {
+		vdo_set_admin_state_code(state, state->next_state);
+		if (state->waiter != NULL)
+			vdo_launch_completion(UDS_FORGET(state->waiter));
+	}
+
+	return true;
+}
+
+/**
+ * begin_operation() - Begin an operation if it may be started given the current state.
+ * @waiter: A completion to notify when the operation is complete; may be NULL.
+ * @initiator: The vdo_admin_initiator to call if the operation may begin; may be NULL.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check begin_operation(struct admin_state *state,
+					const struct admin_state_code *operation,
+					struct vdo_completion *waiter,
+					vdo_admin_initiator *initiator)
+{
+	int result;
+	const struct admin_state_code *next_state = get_next_state(state, operation);
+
+	if (next_state == NULL) {
+		result = uds_log_error_strerror(VDO_INVALID_ADMIN_STATE,
+						"Can't start %s from %s",
+						operation->name,
+						vdo_get_admin_state_code(state)->name);
+	} else if (state->waiter != NULL) {
+		result = uds_log_error_strerror(VDO_COMPONENT_BUSY,
+						"Can't start %s with extant waiter",
+						operation->name);
+	} else {
+		state->waiter = waiter;
+		state->next_state = next_state;
+		vdo_set_admin_state_code(state, operation);
+		if (initiator != NULL) {
+			state->starting = true;
+			initiator(state);
+			state->starting = false;
+			if (state->complete)
+				vdo_finish_operation(state, VDO_SUCCESS);
+		}
+
+		return VDO_SUCCESS;
+	}
+
+	if (waiter != NULL)
+		vdo_continue_completion(waiter, result);
+
+	return result;
+}
+
+/**
+ * start_operation() - Start an operation if it may be started given the current state.
+ * @waiter: A completion to notify when the operation is complete.
+ * @initiator: The vdo_admin_initiator to call if the operation may begin; may be NULL.
+ *
+ * Return: true if the operation was started.
+ */
+static inline bool __must_check start_operation(struct admin_state *state,
+						const struct admin_state_code *operation,
+						struct vdo_completion *waiter,
+						vdo_admin_initiator *initiator)
+{
+	return (begin_operation(state, operation, waiter, initiator) == VDO_SUCCESS);
+}
+
+/**
+ * check_code() - Check the result of a state validation.
+ * @valid: true if the code is of an appropriate type.
+ * @code: The code which failed to be of the correct type.
+ * @what: What the code failed to be, for logging.
+ * @waiter: The completion to notify of the error; may be NULL.
+ *
+ * If the result failed, log an invalid state error and, if there is a waiter, notify it.
+ *
+ * Return: The result of the check.
+ */
+static bool check_code(bool valid,
+		       const struct admin_state_code *code,
+		       const char *what,
+		       struct vdo_completion *waiter)
+{
+	int result;
+
+	if (valid)
+		return true;
+
+	result = uds_log_error_strerror(VDO_INVALID_ADMIN_STATE,
+					"%s is not a %s", code->name, what);
+	if (waiter != NULL)
+		vdo_continue_completion(waiter, result);
+
+	return false;
+}
+
+/**
+ * assert_vdo_drain_operation() - Check that an operation is a drain.
+ * @waiter: The completion to finish with an error if the operation is not a drain.
+ *
+ * Return: true if the specified operation is a drain.
+ */
+static bool __must_check
+assert_vdo_drain_operation(const struct admin_state_code *operation, struct vdo_completion *waiter)
+{
+	return check_code(operation->draining, operation, "drain operation", waiter);
+}
+
+/**
+ * vdo_start_draining() - Initiate a drain operation if the current state permits it.
+ * @operation: The type of drain to initiate.
+ * @waiter: The completion to notify when the drain is complete.
+ * @initiator: The vdo_admin_initiator to call if the operation may begin; may be NULL.
+ *
+ * Return: true if the drain was initiated; if not, the waiter will be notified.
+ */
+bool vdo_start_draining(struct admin_state *state,
+			const struct admin_state_code *operation,
+			struct vdo_completion *waiter,
+			vdo_admin_initiator *initiator)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	if (!assert_vdo_drain_operation(operation, waiter))
+		return false;
+
+	if (code->quiescent) {
+		vdo_launch_completion(waiter);
+		return false;
+	}
+
+	if (!code->normal) {
+		uds_log_error_strerror(VDO_INVALID_ADMIN_STATE,
+				       "can't start %s from %s",
+				       operation->name,
+				       code->name);
+		vdo_continue_completion(waiter, VDO_INVALID_ADMIN_STATE);
+		return false;
+	}
+
+	return start_operation(state, operation, waiter, initiator);
+}
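
For orientation, here is a minimal sketch of the usual pairing of vdo_start_draining() with vdo_finish_draining(). The struct my_component type and the initiate_drain() and suspend_component() functions are hypothetical; a real caller would typically defer the vdo_finish_draining() call until its outstanding work has actually completed.

/* A hypothetical component which embeds an admin_state. */
struct my_component {
	struct admin_state state;
	/* ... component data ... */
};

/* Implements vdo_admin_initiator; called once the drain is permitted. */
static void initiate_drain(struct admin_state *state)
{
	struct my_component *component =
		container_of(state, struct my_component, state);

	/*
	 * Stop accepting new work here. With nothing outstanding, the drain
	 * can complete immediately, which notifies the waiter.
	 */
	vdo_finish_draining(&component->state);
}

static void suspend_component(struct my_component *component,
			      struct vdo_completion *parent)
{
	vdo_start_draining(&component->state, VDO_ADMIN_STATE_SUSPENDING,
			   parent, initiate_drain);
}
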
+
+/**
+ * vdo_finish_draining() - Finish a drain operation if one was in progress.
+ *
+ * Return: true if the state was draining; will notify the waiter if so.
+ */
+bool vdo_finish_draining(struct admin_state *state)
+{
+	return vdo_finish_draining_with_result(state, VDO_SUCCESS);
+}
+
+/**
+ * vdo_finish_draining_with_result() - Finish a drain operation with a status code.
+ *
+ * Return: true if the state was draining; will notify the waiter if so.
+ */
+bool vdo_finish_draining_with_result(struct admin_state *state, int result)
+{
+	return (vdo_is_state_draining(state) && vdo_finish_operation(state, result));
+}
+
+/**
+ * vdo_assert_load_operation() - Check that an operation is a load.
+ * @waiter: The completion to finish with an error if the operation is not a load.
+ *
+ * Return: true if the specified operation is a load.
+ */
+bool vdo_assert_load_operation(const struct admin_state_code *operation,
+			       struct vdo_completion *waiter)
+{
+	return check_code(operation->loading, operation, "load operation", waiter);
+}
+
+/**
+ * vdo_start_loading() - Initiate a load operation if the current state permits it.
+ * @operation: The type of load to initiate.
+ * @waiter: The completion to notify when the load is complete (may be NULL).
+ * @initiator: The vdo_admin_initiator to call if the operation may begin; may be NULL.
+ *
+ * Return: true if the load was initiated; if not, the waiter will be notified.
+ */
+bool vdo_start_loading(struct admin_state *state,
+		       const struct admin_state_code *operation,
+		       struct vdo_completion *waiter,
+		       vdo_admin_initiator *initiator)
+{
+	return (vdo_assert_load_operation(operation, waiter) &&
+		start_operation(state, operation, waiter, initiator));
+}
+
+/**
+ * vdo_finish_loading() - Finish a load operation if one was in progress.
+ *
+ * Return: true if the state was loading; will notify the waiter if so.
+ */
+bool vdo_finish_loading(struct admin_state *state)
+{
+	return vdo_finish_loading_with_result(state, VDO_SUCCESS);
+}
+
+/**
+ * vdo_finish_loading_with_result() - Finish a load operation with a status code.
+ * @result: The result of the load operation.
+ *
+ * Return: true if the state was loading; will notify the waiter if so.
+ */
+bool vdo_finish_loading_with_result(struct admin_state *state, int result)
+{
+	return (vdo_is_state_loading(state) && vdo_finish_operation(state, result));
+}
+
+/**
+ * assert_vdo_resume_operation() - Check whether an admin_state_code is a resume operation.
+ * @waiter: The completion to notify if the operation is not a resume operation; may be NULL.
+ *
+ * Return: true if the code is a resume operation.
+ */
+static bool __must_check assert_vdo_resume_operation(const struct admin_state_code *operation,
+						     struct vdo_completion *waiter)
+{
+	return check_code(operation == VDO_ADMIN_STATE_RESUMING,
+			  operation,
+			  "resume operation",
+			  waiter);
+}
+
+/**
+ * vdo_start_resuming() - Initiate a resume operation if the current state permits it.
+ * @operation: The type of resume to start.
+ * @waiter: The completion to notify when the resume is complete (may be NULL).
+ * @initiator: The vdo_admin_initiator to call if the operation may begin; may be NULL.
+ *
+ * Return: true if the resume was initiated; if not, the waiter will be notified.
+ */
+bool vdo_start_resuming(struct admin_state *state,
+			const struct admin_state_code *operation,
+			struct vdo_completion *waiter,
+			vdo_admin_initiator *initiator)
+{
+	return (assert_vdo_resume_operation(operation, waiter) &&
+		start_operation(state, operation, waiter, initiator));
+}
+
+/**
+ * vdo_finish_resuming() - Finish a resume operation if one was in progress.
+ *
+ * Return: true if the state was resuming; will notify the waiter if so.
+ */
+bool vdo_finish_resuming(struct admin_state *state)
+{
+	return vdo_finish_resuming_with_result(state, VDO_SUCCESS);
+}
+
+/**
+ * vdo_finish_resuming_with_result() - Finish a resume operation with a status code.
+ * @result: The result of the resume operation.
+ *
+ * Return: true if the state was resuming; will notify the waiter if so.
+ */
+bool vdo_finish_resuming_with_result(struct admin_state *state, int result)
+{
+	return (vdo_is_state_resuming(state) && vdo_finish_operation(state, result));
+}
+
+/**
+ * vdo_resume_if_quiescent() - Change the state to normal operation if the current state is
+ *                             quiescent.
+ *
+ * Return: VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise.
+ */
+int vdo_resume_if_quiescent(struct admin_state *state)
+{
+	if (!vdo_is_state_quiescent(state))
+		return VDO_INVALID_ADMIN_STATE;
+
+	vdo_set_admin_state_code(state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_start_operation() - Attempt to start an operation.
+ *
+ * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not
+ */
+int vdo_start_operation(struct admin_state *state, const struct admin_state_code *operation)
+{
+	return vdo_start_operation_with_waiter(state, operation, NULL, NULL);
+}
+
+/**
+ * vdo_start_operation_with_waiter() - Attempt to start an operation.
+ * @waiter: The completion to notify when the operation completes or fails to start; may be NULL.
+ * @initiator: The vdo_admin_initiator to call if the operation may begin; may be NULL.
+ *
+ * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not
+ */
+int vdo_start_operation_with_waiter(struct admin_state *state,
+				    const struct admin_state_code *operation,
+				    struct vdo_completion *waiter,
+				    vdo_admin_initiator *initiator)
+{
+	return (check_code(operation->operating, operation, "operation", waiter) ?
+		begin_operation(state, operation, waiter, initiator) :
+		VDO_INVALID_ADMIN_STATE);
+}
diff --git a/drivers/md/dm-vdo/admin-state.h b/drivers/md/dm-vdo/admin-state.h
new file mode 100644
index 00000000000..925211a8a76
--- /dev/null
+++ b/drivers/md/dm-vdo/admin-state.h
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_ADMIN_STATE_H
+#define VDO_ADMIN_STATE_H
+
+#include "completion.h"
+#include "types.h"
+
+struct admin_state_code {
+	const char *name;
+	/* Normal operation, data_vios may be active */
+	bool normal;
+	/* I/O is draining, new requests should not start */
+	bool draining;
+	/* This is a startup time operation */
+	bool loading;
+	/* The next state will be quiescent */
+	bool quiescing;
+	/* The VDO is quiescent, there should be no I/O */
+	bool quiescent;
+	/* Whether an operation is in progress and so no other operation may be started */
+	bool operating;
+};
+
+extern const struct admin_state_code *VDO_ADMIN_STATE_NORMAL_OPERATION;
+extern const struct admin_state_code *VDO_ADMIN_STATE_OPERATING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_FORMATTING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_RECOVERY;
+extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_REBUILD;
+extern const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY;
+extern const struct admin_state_code *VDO_ADMIN_STATE_NEW;
+extern const struct admin_state_code *VDO_ADMIN_STATE_INITIALIZED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_RECOVERING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_REBUILDING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SAVING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SAVED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SCRUBBING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_STOPPING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_STOPPED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDING;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED;
+extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED_OPERATION;
+extern const struct admin_state_code *VDO_ADMIN_STATE_RESUMING;
+
+struct admin_state {
+	const struct admin_state_code *current_state;
+	/* The next administrative state (when the current operation finishes) */
+	const struct admin_state_code *next_state;
+	/* A completion waiting on a state change */
+	struct vdo_completion *waiter;
+	/* Whether an operation is being initiated */
+	bool starting;
+	/* Whether an operation has completed in the initiator */
+	bool complete;
+};
+
+/**
+ * typedef vdo_admin_initiator - A method to be called once an admin operation may be initiated.
+ */
+typedef void vdo_admin_initiator(struct admin_state *state);
+
+static inline const struct admin_state_code * __must_check
+vdo_get_admin_state_code(const struct admin_state *state)
+{
+	return READ_ONCE(state->current_state);
+}
+
+/**
+ * vdo_set_admin_state_code() - Set the current admin state code.
+ *
+ * This function should be used primarily for initialization and by admin_state internals. Most uses
+ * should go through the operation interfaces.
+ */
+static inline void
+vdo_set_admin_state_code(struct admin_state *state, const struct admin_state_code *code)
+{
+	WRITE_ONCE(state->current_state, code);
+}
+
+static inline bool __must_check vdo_is_state_normal(const struct admin_state *state)
+{
+	return vdo_get_admin_state_code(state)->normal;
+}
+
+static inline bool __must_check vdo_is_state_suspending(const struct admin_state *state)
+{
+	return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SUSPENDING);
+}
+
+static inline bool __must_check vdo_is_state_saving(const struct admin_state *state)
+{
+	return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SAVING);
+}
+
+static inline bool __must_check vdo_is_state_saved(const struct admin_state *state)
+{
+	return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SAVED);
+}
+
+static inline bool __must_check vdo_is_state_draining(const struct admin_state *state)
+{
+	return vdo_get_admin_state_code(state)->draining;
+}
+
+static inline bool __must_check vdo_is_state_loading(const struct admin_state *state)
+{
+	return vdo_get_admin_state_code(state)->loading;
+}
+
+static inline bool __must_check vdo_is_state_resuming(const struct admin_state *state)
+{
+	return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_RESUMING);
+}
+
+static inline bool __must_check vdo_is_state_clean_load(const struct admin_state *state)
+{
+	const struct admin_state_code *code = vdo_get_admin_state_code(state);
+
+	return ((code == VDO_ADMIN_STATE_FORMATTING) || (code == VDO_ADMIN_STATE_LOADING));
+}
+
+static inline bool __must_check vdo_is_state_quiescing(const struct admin_state *state)
+{
+	return vdo_get_admin_state_code(state)->quiescing;
+}
+
+static inline bool __must_check vdo_is_state_quiescent(const struct admin_state *state)
+{
+	return vdo_get_admin_state_code(state)->quiescent;
+}
+
+bool vdo_start_draining(struct admin_state *state,
+			const struct admin_state_code *operation,
+			struct vdo_completion *waiter,
+			vdo_admin_initiator *initiator);
+
+bool vdo_finish_draining(struct admin_state *state);
+
+bool vdo_finish_draining_with_result(struct admin_state *state, int result);
+
+bool __must_check
+vdo_assert_load_operation(const struct admin_state_code *operation, struct vdo_completion *waiter);
+
+bool vdo_start_loading(struct admin_state *state,
+		       const struct admin_state_code *operation,
+		       struct vdo_completion *waiter,
+		       vdo_admin_initiator *initiator);
+
+bool vdo_finish_loading(struct admin_state *state);
+
+bool vdo_finish_loading_with_result(struct admin_state *state, int result);
+
+bool vdo_start_resuming(struct admin_state *state,
+			const struct admin_state_code *operation,
+			struct vdo_completion *waiter,
+			vdo_admin_initiator *initiator);
+
+bool vdo_finish_resuming(struct admin_state *state);
+
+bool vdo_finish_resuming_with_result(struct admin_state *state, int result);
+
+int vdo_resume_if_quiescent(struct admin_state *state);
+
+int vdo_start_operation(struct admin_state *state, const struct admin_state_code *operation);
+
+int vdo_start_operation_with_waiter(struct admin_state *state,
+				    const struct admin_state_code *operation,
+				    struct vdo_completion *waiter,
+				    vdo_admin_initiator *initiator);
+
+bool vdo_finish_operation(struct admin_state *state, int result);
+
+#endif /* VDO_ADMIN_STATE_H */
diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c
new file mode 100644
index 00000000000..4c9184cbc1a
--- /dev/null
+++ b/drivers/md/dm-vdo/block-map.c
@@ -0,0 +1,3388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "block-map.h"
+
+#include <linux/bio.h>
+#include <linux/ratelimit.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "physical-zone.h"
+#include "recovery-journal.h"
+#include "slab-depot.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/**
+ * DOC: Block map eras
+ *
+ * The block map era, or maximum age, is used as follows:
+ *
+ * Each block map page, when dirty, records the earliest recovery journal block sequence number of
+ * the changes reflected in that dirty block. Sequence numbers are classified into eras: every
+ * @maximum_age sequence numbers, we switch to a new era. Block map pages are assigned to eras
+ * according to the sequence number they record.
+ *
+ * In the current (newest) era, block map pages are not written unless there is cache pressure. In
+ * the next oldest era, each time a new journal block is written 1/@maximum_age of the pages in
+ * this era are issued for write. In all older eras, pages are issued for write immediately.
+ */
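
As a rough worked example (the numbers are arbitrary, assuming a page's era is its recorded sequence number divided by @maximum_age): with a maximum age of 4096, a dirty page recording sequence number 9000 belongs to era 2 (sequence numbers 8192-12287). While the journal is still writing blocks in era 2, that page stays cached unless there is cache pressure. Once the journal moves into era 3, roughly 1/4096 of the era-2 pages are issued for write with each new journal block, so the whole era drains over one era's worth of journal writes. If the page somehow remains dirty when the journal reaches era 4, it is issued for write immediately.
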
+
+struct page_descriptor {
+	root_count_t root_index;
+	height_t height;
+	page_number_t page_index;
+	slot_number_t slot;
+} __packed;
+
+union page_key {
+	struct page_descriptor descriptor;
+	u64 key;
+};
+
+struct write_if_not_dirtied_context {
+	struct block_map_zone *zone;
+	u8 generation;
+};
+
+struct block_map_tree_segment {
+	struct tree_page *levels[VDO_BLOCK_MAP_TREE_HEIGHT];
+};
+
+struct block_map_tree {
+	struct block_map_tree_segment *segments;
+};
+
+struct forest {
+	struct block_map *map;
+	size_t segments;
+	struct boundary *boundaries;
+	struct tree_page **pages;
+	struct block_map_tree trees[];
+};
+
+struct cursor_level {
+	page_number_t page_index;
+	slot_number_t slot;
+};
+
+struct cursors;
+
+struct cursor {
+	struct waiter waiter;
+	struct block_map_tree *tree;
+	height_t height;
+	struct cursors *parent;
+	struct boundary boundary;
+	struct cursor_level levels[VDO_BLOCK_MAP_TREE_HEIGHT];
+	struct pooled_vio *vio;
+};
+
+struct cursors {
+	struct block_map_zone *zone;
+	struct vio_pool *pool;
+	vdo_entry_callback *entry_callback;
+	struct vdo_completion *parent;
+	root_count_t active_roots;
+	struct cursor cursors[];
+};
+
+/* Used to indicate that the page holding the location of a tree root has been "loaded". */
+const physical_block_number_t VDO_INVALID_PBN = 0xFFFFFFFFFFFFFFFF;
+
+enum {
+	LOG_INTERVAL = 4000,
+	DISPLAY_INTERVAL = 100000,
+};
+
+/*
+ * For adjusting VDO page cache statistic fields which are only mutated on the logical zone thread.
+ * Prevents any compiler shenanigans from affecting other threads reading those stats.
+ */
+#define ADD_ONCE(value, delta) WRITE_ONCE(value, (value) + (delta))
+
+static inline bool is_dirty(const struct page_info *info)
+{
+	return info->state == PS_DIRTY;
+}
+
+static inline bool is_present(const struct page_info *info)
+{
+	return (info->state == PS_RESIDENT) || (info->state == PS_DIRTY);
+}
+
+static inline bool is_in_flight(const struct page_info *info)
+{
+	return (info->state == PS_INCOMING) || (info->state == PS_OUTGOING);
+}
+
+static inline bool is_incoming(const struct page_info *info)
+{
+	return info->state == PS_INCOMING;
+}
+
+static inline bool is_outgoing(const struct page_info *info)
+{
+	return info->state == PS_OUTGOING;
+}
+
+static inline bool is_valid(const struct page_info *info)
+{
+	return is_present(info) || is_outgoing(info);
+}
+
+static char *get_page_buffer(struct page_info *info)
+{
+	struct vdo_page_cache *cache = info->cache;
+
+	return &cache->pages[(info - cache->infos) * VDO_BLOCK_SIZE];
+}
+
+static inline struct vdo_page_completion *page_completion_from_waiter(struct waiter *waiter)
+{
+	struct vdo_page_completion *completion;
+
+	if (waiter == NULL)
+		return NULL;
+
+	completion = container_of(waiter, struct vdo_page_completion, waiter);
+	vdo_assert_completion_type(&completion->completion, VDO_PAGE_COMPLETION);
+	return completion;
+}
+
+/**
+ * initialize_info() - Initialize all page info structures and put them on the free list.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int initialize_info(struct vdo_page_cache *cache)
+{
+	struct page_info *info;
+
+	INIT_LIST_HEAD(&cache->free_list);
+	for (info = cache->infos; info < cache->infos + cache->page_count; ++info) {
+		int result;
+
+		info->cache = cache;
+		info->state = PS_FREE;
+		info->pbn = NO_PAGE;
+
+		result = create_metadata_vio(cache->vdo,
+					     VIO_TYPE_BLOCK_MAP,
+					     VIO_PRIORITY_METADATA, info,
+					     get_page_buffer(info),
+					     &info->vio);
+		if (result != VDO_SUCCESS)
+			return result;
+
+		/* The thread ID should never change. */
+		info->vio->completion.callback_thread_id = cache->zone->thread_id;
+
+		INIT_LIST_HEAD(&info->state_entry);
+		list_add_tail(&info->state_entry, &cache->free_list);
+		INIT_LIST_HEAD(&info->lru_entry);
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * allocate_cache_components() - Allocate components of the cache which require their own
+ *                               allocation.
+ * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be
+ *               written out.
+ *
+ * The caller is responsible for all clean up on errors.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check allocate_cache_components(struct vdo_page_cache *cache)
+{
+	u64 size = cache->page_count * (u64) VDO_BLOCK_SIZE;
+	int result;
+
+	result = UDS_ALLOCATE(cache->page_count, struct page_info, "page infos", &cache->infos);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = uds_allocate_memory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = vdo_make_int_map(cache->page_count, 0, &cache->page_map);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	return initialize_info(cache);
+}
+
+/**
+ * assert_on_cache_thread() - Assert that a function has been called on the VDO page cache's
+ *                            thread.
+ */
+static inline void assert_on_cache_thread(struct vdo_page_cache *cache, const char *function_name)
+{
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((thread_id == cache->zone->thread_id),
+			"%s() must only be called on cache thread %d, not thread %d",
+			function_name,
+			cache->zone->thread_id,
+			thread_id);
+}
+
+/** assert_io_allowed() - Assert that a page cache may issue I/O. */
+static inline void assert_io_allowed(struct vdo_page_cache *cache)
+{
+	ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&cache->zone->state),
+			"VDO page cache may issue I/O");
+}
+
+/** report_cache_pressure() - Log and, if enabled, report cache pressure. */
+static void report_cache_pressure(struct vdo_page_cache *cache)
+{
+	ADD_ONCE(cache->stats.cache_pressure, 1);
+	if (cache->waiter_count > cache->page_count) {
+		if ((cache->pressure_report % LOG_INTERVAL) == 0)
+			uds_log_info("page cache pressure %u", cache->stats.cache_pressure);
+
+		if (++cache->pressure_report >= DISPLAY_INTERVAL)
+			cache->pressure_report = 0;
+	}
+}
+
+/**
+ * get_page_state_name() - Return the name of a page state.
+ *
+ * If the page state is invalid, a static string is returned and the invalid state is logged.
+ *
+ * Return: A pointer to a static page state name.
+ */
+static const char * __must_check get_page_state_name(enum vdo_page_buffer_state state)
+{
+	int result;
+	static const char * const state_names[] = {
+		"UDS_FREE", "INCOMING", "FAILED", "RESIDENT", "DIRTY", "OUTGOING"
+	};
+
+	STATIC_ASSERT(ARRAY_SIZE(state_names) == PAGE_STATE_COUNT);
+
+	result = ASSERT(state < ARRAY_SIZE(state_names), "Unknown page_state value %d", state);
+	if (result != UDS_SUCCESS)
+		return "[UNKNOWN PAGE STATE]";
+
+	return state_names[state];
+}
+
+/**
+ * update_counter() - Update the counter associated with a given state.
+ * @info: The page info to count.
+ * @delta: The delta to apply to the counter.
+ */
+static void update_counter(struct page_info *info, s32 delta)
+{
+	struct block_map_statistics *stats = &info->cache->stats;
+
+	switch (info->state) {
+	case PS_FREE:
+		ADD_ONCE(stats->free_pages, delta);
+		return;
+
+	case PS_INCOMING:
+		ADD_ONCE(stats->incoming_pages, delta);
+		return;
+
+	case PS_OUTGOING:
+		ADD_ONCE(stats->outgoing_pages, delta);
+		return;
+
+	case PS_FAILED:
+		ADD_ONCE(stats->failed_pages, delta);
+		return;
+
+	case PS_RESIDENT:
+		ADD_ONCE(stats->clean_pages, delta);
+		return;
+
+	case PS_DIRTY:
+		ADD_ONCE(stats->dirty_pages, delta);
+		return;
+
+	default:
+		return;
+	}
+}
+
+/** update_lru() - Update the lru information for an active page. */
+static void update_lru(struct page_info *info)
+{
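+	/* Only move the page if it is not already the most recently used (tail) entry. */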
+	if (info->cache->lru_list.prev != &info->lru_entry)
+		list_move_tail(&info->lru_entry, &info->cache->lru_list);
+}
+
+/**
+ * set_info_state() - Set the state of a page_info and put it on the right list, adjusting
+ *                    counters.
+ */
+static void
+set_info_state(struct page_info *info, enum vdo_page_buffer_state new_state)
+{
+	if (new_state == info->state)
+		return;
+
+	update_counter(info, -1);
+	info->state = new_state;
+	update_counter(info, 1);
+
+	switch (info->state) {
+	case PS_FREE:
+	case PS_FAILED:
+		list_move_tail(&info->state_entry, &info->cache->free_list);
+		return;
+
+	case PS_OUTGOING:
+		list_move_tail(&info->state_entry, &info->cache->outgoing_list);
+		return;
+
+	case PS_DIRTY:
+		return;
+
+	default:
+		list_del_init(&info->state_entry);
+	}
+}
+
+/** set_info_pbn() - Set the pbn for an info, updating the map as needed. */
+static int __must_check set_info_pbn(struct page_info *info, physical_block_number_t pbn)
+{
+	struct vdo_page_cache *cache = info->cache;
+
+	/* Either the new or the old page number must be NO_PAGE. */
+	int result = ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE),
+			    "Must free a page before reusing it.");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (info->pbn != NO_PAGE)
+		vdo_int_map_remove(cache->page_map, info->pbn);
+
+	info->pbn = pbn;
+
+	if (pbn != NO_PAGE) {
+		result = vdo_int_map_put(cache->page_map, pbn, info, true, NULL);
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+	return VDO_SUCCESS;
+}
+
+/** reset_page_info() - Reset page info to represent an unallocated page. */
+static int reset_page_info(struct page_info *info)
+{
+	int result;
+
+	result = ASSERT(info->busy == 0, "VDO Page must not be busy");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(!vdo_has_waiters(&info->waiting), "VDO Page must not have waiters");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = set_info_pbn(info, NO_PAGE);
+	set_info_state(info, PS_FREE);
+	list_del_init(&info->lru_entry);
+	return result;
+}
+
+/**
+ * find_free_page() - Find a free page.
+ *
+ * Return: A pointer to the page info structure (if found), NULL otherwise.
+ */
+static struct page_info * __must_check find_free_page(struct vdo_page_cache *cache)
+{
+	struct page_info *info;
+
+	info = list_first_entry_or_null(&cache->free_list, struct page_info, state_entry);
+	if (info != NULL)
+		list_del_init(&info->state_entry);
+	return info;
+}
+
+/**
+ * find_page() - Find the page info (if any) associated with a given pbn.
+ * @pbn: The absolute physical block number of the page.
+ *
+ * Return: The page info for the page if available, or NULL if not.
+ */
+static struct page_info * __must_check
+find_page(struct vdo_page_cache *cache, physical_block_number_t pbn)
+{
+	if ((cache->last_found != NULL) && (cache->last_found->pbn == pbn))
+		return cache->last_found;
+	cache->last_found = vdo_int_map_get(cache->page_map, pbn);
+	return cache->last_found;
+}
+
+/**
+ * select_lru_page() - Determine which page is least recently used.
+ *
+ * Picks the least recently used page from among the non-busy entries at the front of the LRU
+ * ring. Since pages are moved to the end of the ring whenever they are marked busy, the entries
+ * at the front are unlikely to be busy unless the queue is very short, though it is not
+ * impossible.
+ *
+ * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be
+ *         found. The page can be dirty or resident.
+ */
+static struct page_info * __must_check select_lru_page(struct vdo_page_cache *cache)
+{
+	struct page_info *info;
+
+	list_for_each_entry(info, &cache->lru_list, lru_entry)
+		if ((info->busy == 0) && !is_in_flight(info))
+			return info;
+
+	return NULL;
+}
+
+/* ASYNCHRONOUS INTERFACE BEYOND THIS POINT */
+
+/**
+ * complete_with_page() - Helper to complete the VDO Page Completion request successfully.
+ * @info: The page info representing the result page.
+ * @vdo_page_comp: The VDO page completion to complete.
+ */
+static void complete_with_page(struct page_info *info, struct vdo_page_completion *vdo_page_comp)
+{
+	bool available = vdo_page_comp->writable ? is_present(info) : is_valid(info);
+
+	if (!available) {
+		uds_log_error_strerror(VDO_BAD_PAGE,
+				       "Requested cache page %llu in state %s is not %s",
+				       (unsigned long long) info->pbn,
+				       get_page_state_name(info->state),
+				       vdo_page_comp->writable ? "present" :
+				       "valid");
+		vdo_fail_completion(&vdo_page_comp->completion, VDO_BAD_PAGE);
+		return;
+	}
+
+	vdo_page_comp->info = info;
+	vdo_page_comp->ready = true;
+	vdo_finish_completion(&vdo_page_comp->completion);
+}
+
+/**
+ * complete_waiter_with_error() - Complete a page completion with an error code.
+ * @waiter: The page completion, as a waiter.
+ * @result_ptr: A pointer to the error code.
+ *
+ * Implements waiter_callback.
+ */
+static void complete_waiter_with_error(struct waiter *waiter, void *result_ptr)
+{
+	int *result = result_ptr;
+
+	vdo_fail_completion(&page_completion_from_waiter(waiter)->completion, *result);
+}
+
+/**
+ * complete_waiter_with_page() - Complete a page completion with a page.
+ * @waiter: The page completion, as a waiter.
+ * @page_info: The page info to complete with.
+ *
+ * Implements waiter_callback.
+ */
+static void complete_waiter_with_page(struct waiter *waiter, void *page_info)
+{
+	complete_with_page((struct page_info *) page_info, page_completion_from_waiter(waiter));
+}
+
+/**
+ * distribute_page_over_queue() - Complete a queue of VDO page completions with a page result.
+ *
+ * Upon completion the queue will be empty.
+ *
+ * Return: The number of pages distributed.
+ */
+static unsigned int distribute_page_over_queue(struct page_info *info, struct wait_queue *queue)
+{
+	size_t pages;
+
+	update_lru(info);
+	pages = vdo_count_waiters(queue);
+
+	/*
+	 * Increment the busy count once for each pending completion so that this page does not
+	 * stop being busy until all completions have been processed (VDO-83).
+	 */
+	info->busy += pages;
+
+	vdo_notify_all_waiters(queue, complete_waiter_with_page, info);
+	return pages;
+}
+
+/**
+ * set_persistent_error() - Set a persistent error which all requests will receive in the future.
+ * @context: A string describing what triggered the error.
+ *
+ * Once triggered, all enqueued completions will get this error. Any future requests will result in
+ * this error as well.
+ */
+static void set_persistent_error(struct vdo_page_cache *cache, const char *context, int result)
+{
+	struct page_info *info;
+	/* If we're already read-only, there's no need to log. */
+	struct vdo *vdo = cache->zone->block_map->vdo;
+
+	if ((result != VDO_READ_ONLY) && !vdo_is_read_only(vdo)) {
+		uds_log_error_strerror(result, "VDO Page Cache persistent error: %s", context);
+		vdo_enter_read_only_mode(vdo, result);
+	}
+
+	assert_on_cache_thread(cache, __func__);
+
+	vdo_notify_all_waiters(&cache->free_waiters, complete_waiter_with_error, &result);
+	cache->waiter_count = 0;
+
+	for (info = cache->infos; info < cache->infos + cache->page_count; ++info)
+		vdo_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result);
+}
+
+/**
+ * validate_completed_page() - Check that a page completion which is being freed to the cache
+ *                             refers to a valid page and is in a valid state.
+ * @writable: Whether a writable page is required.
+ *
+ * Return: VDO_SUCCESS if the page is valid, otherwise an error code.
+ */
+static int __must_check
+validate_completed_page(struct vdo_page_completion *completion, bool writable)
+{
+	int result;
+
+	result = ASSERT(completion->ready, "VDO Page completion not ready");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(completion->info != NULL, "VDO Page Completion must be complete");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(completion->info->pbn == completion->pbn,
+			"VDO Page Completion pbn must be consistent");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(is_valid(completion->info), "VDO Page Completion page must be valid");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	if (writable) {
+		result = ASSERT(completion->writable, "VDO Page Completion is writable");
+		if (result != UDS_SUCCESS)
+			return result;
+	}
+
+	return VDO_SUCCESS;
+}
+
+static void check_for_drain_complete(struct block_map_zone *zone)
+{
+	if (vdo_is_state_draining(&zone->state) &&
+	    (zone->active_lookups == 0) &&
+	    !vdo_has_waiters(&zone->flush_waiters) &&
+	    !is_vio_pool_busy(zone->vio_pool) &&
+	    (zone->page_cache.outstanding_reads == 0) &&
+	    (zone->page_cache.outstanding_writes == 0))
+		vdo_finish_draining_with_result(&zone->state,
+						(vdo_is_read_only(zone->block_map->vdo) ?
+						 VDO_READ_ONLY : VDO_SUCCESS));
+}
+
+static void enter_zone_read_only_mode(struct block_map_zone *zone, int result)
+{
+	vdo_enter_read_only_mode(zone->block_map->vdo, result);
+
+	/*
+	 * We are in read-only mode, so we won't ever write any page out. Just take all waiters off
+	 * the queue so the zone can drain.
+	 */
+	while (vdo_has_waiters(&zone->flush_waiters))
+		vdo_dequeue_next_waiter(&zone->flush_waiters);
+
+	check_for_drain_complete(zone);
+}
+
+static bool __must_check
+validate_completed_page_or_enter_read_only_mode(struct vdo_page_completion *completion,
+						bool writable)
+{
+	int result = validate_completed_page(completion, writable);
+
+	if (result == VDO_SUCCESS)
+		return true;
+
+	enter_zone_read_only_mode(completion->info->cache->zone, result);
+	return false;
+}
+
+/**
+ * handle_load_error() - Handle page load errors.
+ * @completion: The page read vio.
+ */
+static void handle_load_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct page_info *info = completion->parent;
+	struct vdo_page_cache *cache = info->cache;
+
+	assert_on_cache_thread(cache, __func__);
+	vio_record_metadata_io_error(as_vio(completion));
+	vdo_enter_read_only_mode(cache->zone->block_map->vdo, result);
+	ADD_ONCE(cache->stats.failed_reads, 1);
+	set_info_state(info, PS_FAILED);
+	vdo_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result);
+	reset_page_info(info);
+
+	/*
+	 * Don't decrement until right before calling check_for_drain_complete() to
+	 * ensure that the above work can't cause the page cache to be freed out from under us.
+	 */
+	cache->outstanding_reads--;
+	check_for_drain_complete(cache->zone);
+}
+
+/**
+ * page_is_loaded() - Callback used when a page has been loaded.
+ * @completion: The vio which has loaded the page. Its parent is the page_info.
+ */
+static void page_is_loaded(struct vdo_completion *completion)
+{
+	struct page_info *info = completion->parent;
+	struct vdo_page_cache *cache = info->cache;
+	nonce_t nonce = info->cache->zone->block_map->nonce;
+	struct block_map_page *page;
+	enum block_map_page_validity validity;
+
+	assert_on_cache_thread(cache, __func__);
+
+	page = (struct block_map_page *) get_page_buffer(info);
+	validity = vdo_validate_block_map_page(page, nonce, info->pbn);
+	if (validity == VDO_BLOCK_MAP_PAGE_BAD) {
+		int result = uds_log_error_strerror(VDO_BAD_PAGE,
+						    "Expected page %llu but got page %llu instead",
+						    (unsigned long long) info->pbn,
+						    (unsigned long long) vdo_get_block_map_page_pbn(page));
+
+		vdo_continue_completion(completion, result);
+		return;
+	}
+
+	if (validity == VDO_BLOCK_MAP_PAGE_INVALID)
+		vdo_format_block_map_page(page, nonce, info->pbn, false);
+
+	info->recovery_lock = 0;
+	set_info_state(info, PS_RESIDENT);
+	distribute_page_over_queue(info, &info->waiting);
+
+	/*
+	 * Don't decrement until right before calling check_for_drain_complete() to
+	 * ensure that the above work can't cause the page cache to be freed out from under us.
+	 */
+	cache->outstanding_reads--;
+	check_for_drain_complete(cache->zone);
+}
+
+/**
+ * handle_rebuild_read_error() - Handle a read error during a read-only rebuild.
+ * @completion: The page load completion.
+ */
+static void handle_rebuild_read_error(struct vdo_completion *completion)
+{
+	struct page_info *info = completion->parent;
+	struct vdo_page_cache *cache = info->cache;
+
+	assert_on_cache_thread(cache, __func__);
+
+	/*
+	 * We are doing a read-only rebuild, so treat this as a successful read of an uninitialized
+	 * page.
+	 */
+	vio_record_metadata_io_error(as_vio(completion));
+	ADD_ONCE(cache->stats.failed_reads, 1);
+	memset(get_page_buffer(info), 0, VDO_BLOCK_SIZE);
+	vdo_reset_completion(completion);
+	page_is_loaded(completion);
+}
+
+static void load_cache_page_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct page_info *info = vio->completion.parent;
+
+	continue_vio_after_io(vio, page_is_loaded, info->cache->zone->thread_id);
+}
+
+/**
+ * launch_page_load() - Begin the process of loading a page.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check launch_page_load(struct page_info *info, physical_block_number_t pbn)
+{
+	int result;
+	vdo_action *callback;
+	struct vdo_page_cache *cache = info->cache;
+
+	assert_io_allowed(cache);
+
+	result = set_info_pbn(info, pbn);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = ASSERT((info->busy == 0), "Page is not busy before loading.");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	set_info_state(info, PS_INCOMING);
+	cache->outstanding_reads++;
+	ADD_ONCE(cache->stats.pages_loaded, 1);
+	callback = (cache->rebuilding ? handle_rebuild_read_error : handle_load_error);
+	submit_metadata_vio(info->vio,
+			    pbn,
+			    load_cache_page_endio,
+			    callback,
+			    REQ_OP_READ | REQ_PRIO);
+	return VDO_SUCCESS;
+}
+
+static void write_pages(struct vdo_completion *completion);
+
+/** handle_flush_error() - Handle errors flushing the layer. */
+static void handle_flush_error(struct vdo_completion *completion)
+{
+	struct page_info *info = completion->parent;
+
+	vio_record_metadata_io_error(as_vio(completion));
+	set_persistent_error(info->cache, "flush failed", completion->result);
+	write_pages(completion);
+}
+
+static void flush_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct page_info *info = vio->completion.parent;
+
+	continue_vio_after_io(vio, write_pages, info->cache->zone->thread_id);
+}
+
+/** save_pages() - Attempt to save the outgoing pages by first flushing the layer. */
+static void save_pages(struct vdo_page_cache *cache)
+{
+	struct page_info *info;
+	struct vio *vio;
+
+	if ((cache->pages_in_flush > 0) || (cache->pages_to_flush == 0))
+		return;
+
+	assert_io_allowed(cache);
+
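+	/* Borrow the first outgoing page's vio to issue the flush which precedes the writes. */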
+	info = list_first_entry(&cache->outgoing_list, struct page_info, state_entry);
+
+	cache->pages_in_flush = cache->pages_to_flush;
+	cache->pages_to_flush = 0;
+	ADD_ONCE(cache->stats.flush_count, 1);
+
+	vio = info->vio;
+
+	/*
+	 * We must make sure that the recovery journal entries that changed these pages were
+	 * successfully persisted, and thus must issue a flush before each batch of pages is
+	 * written to ensure this.
+	 */
+	submit_flush_vio(vio, flush_endio, handle_flush_error);
+}
+
+/**
+ * schedule_page_save() - Add a page to the outgoing list of pages waiting to be saved.
+ *
+ * Once in the list, a page may not be used until it has been written out.
+ */
+static void schedule_page_save(struct page_info *info)
+{
+	if (info->busy > 0) {
+		info->write_status = WRITE_STATUS_DEFERRED;
+		return;
+	}
+
+	info->cache->pages_to_flush++;
+	info->cache->outstanding_writes++;
+	set_info_state(info, PS_OUTGOING);
+}
+
+/**
+ * launch_page_save() - Add a page to outgoing pages waiting to be saved, and then start saving
+ * pages if another save is not in progress.
+ */
+static void launch_page_save(struct page_info *info)
+{
+	schedule_page_save(info);
+	save_pages(info->cache);
+}
+
+/**
+ * completion_needs_page() - Determine whether a given vdo_page_completion (as a waiter) is
+ *                           requesting a given page number.
+ * @context: A pointer to the pbn of the desired page.
+ *
+ * Implements waiter_match.
+ *
+ * Return: true if the page completion is for the desired page number.
+ */
+static bool completion_needs_page(struct waiter *waiter, void *context)
+{
+	physical_block_number_t *pbn = context;
+
+	return (page_completion_from_waiter(waiter)->pbn == *pbn);
+}
+
+/**
+ * allocate_free_page() - Allocate a free page to the first completion in the waiting queue, and
+ *                        any other completions that match it in page number.
+ */
+static void allocate_free_page(struct page_info *info)
+{
+	int result;
+	struct waiter *oldest_waiter;
+	physical_block_number_t pbn;
+	struct vdo_page_cache *cache = info->cache;
+
+	assert_on_cache_thread(cache, __func__);
+
+	if (!vdo_has_waiters(&cache->free_waiters)) {
+		if (cache->stats.cache_pressure > 0) {
+			uds_log_info("page cache pressure relieved");
+			WRITE_ONCE(cache->stats.cache_pressure, 0);
+		}
+		return;
+	}
+
+	result = reset_page_info(info);
+	if (result != VDO_SUCCESS) {
+		set_persistent_error(cache, "cannot reset page info", result);
+		return;
+	}
+
+	oldest_waiter = vdo_get_first_waiter(&cache->free_waiters);
+	pbn = page_completion_from_waiter(oldest_waiter)->pbn;
+
+	/*
+	 * Remove all entries which match the page number in question and push them onto the page
+	 * info's wait queue.
+	 */
+	vdo_dequeue_matching_waiters(&cache->free_waiters,
+				     completion_needs_page,
+				     &pbn,
+				     &info->waiting);
+	cache->waiter_count -= vdo_count_waiters(&info->waiting);
+
+	result = launch_page_load(info, pbn);
+	if (result != VDO_SUCCESS)
+		vdo_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result);
+}
+
+/**
+ * discard_a_page() - Begin the process of discarding a page.
+ *
+ * If no page is discardable, increments a count of deferred frees so that the next release of a
+ * page which is no longer busy will kick off another discard cycle. This is an indication that the
+ * cache is not big enough.
+ *
+ * If the selected page is not dirty, immediately allocates the page to the oldest completion
+ * waiting for a free page.
+ */
+static void discard_a_page(struct vdo_page_cache *cache)
+{
+	struct page_info *info = select_lru_page(cache);
+
+	if (info == NULL) {
+		report_cache_pressure(cache);
+		return;
+	}
+
+	if (!is_dirty(info)) {
+		allocate_free_page(info);
+		return;
+	}
+
+	ASSERT_LOG_ONLY(!is_in_flight(info), "page selected for discard is not in flight");
+
+	++cache->discard_count;
+	info->write_status = WRITE_STATUS_DISCARD;
+	launch_page_save(info);
+}
+
+/**
+ * discard_page_for_completion() - Helper used to trigger a discard so that the completion can get
+ *                                 a different page.
+ */
+static void discard_page_for_completion(struct vdo_page_completion *vdo_page_comp)
+{
+	struct vdo_page_cache *cache = vdo_page_comp->cache;
+
+	++cache->waiter_count;
+	vdo_enqueue_waiter(&cache->free_waiters, &vdo_page_comp->waiter);
+	discard_a_page(cache);
+}
+
+/**
+ * discard_page_if_needed() - Helper used to trigger a discard if the cache needs another free
+ *                            page.
+ * @cache: The page cache.
+ */
+static void discard_page_if_needed(struct vdo_page_cache *cache)
+{
+	if (cache->waiter_count > cache->discard_count)
+		discard_a_page(cache);
+}
+
+/**
+ * write_has_finished() - Inform the cache that a write has finished (possibly with an error).
+ * @info: The info structure for the page whose write just completed.
+ *
+ * Return: true if the page write was a discard.
+ */
+static bool write_has_finished(struct page_info *info)
+{
+	bool was_discard = (info->write_status == WRITE_STATUS_DISCARD);
+
+	assert_on_cache_thread(info->cache, __func__);
+	info->cache->outstanding_writes--;
+
+	info->write_status = WRITE_STATUS_NORMAL;
+	return was_discard;
+}
+
+/**
+ * handle_page_write_error() - Handler for page write errors.
+ * @completion: The page write vio.
+ */
+static void handle_page_write_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct page_info *info = completion->parent;
+	struct vdo_page_cache *cache = info->cache;
+
+	vio_record_metadata_io_error(as_vio(completion));
+
+	/* If we're already read-only, write failures are to be expected. */
+	if (result != VDO_READ_ONLY) {
+		static DEFINE_RATELIMIT_STATE(error_limiter,
+					      DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+
+		if (__ratelimit(&error_limiter))
+			uds_log_error("failed to write block map page %llu",
+				      (unsigned long long) info->pbn);
+	}
+
+	set_info_state(info, PS_DIRTY);
+	ADD_ONCE(cache->stats.failed_writes, 1);
+	set_persistent_error(cache, "cannot write page", result);
+
+	if (!write_has_finished(info))
+		discard_page_if_needed(cache);
+
+	check_for_drain_complete(cache->zone);
+}
+
+static void page_is_written_out(struct vdo_completion *completion);
+
+static void write_cache_page_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct page_info *info = vio->completion.parent;
+
+	continue_vio_after_io(vio, page_is_written_out, info->cache->zone->thread_id);
+}
+
+/**
+ * page_is_written_out() - Callback used when a page has been written out.
+ * @completion: The vio which wrote the page. Its parent is a page_info.
+ */
+static void page_is_written_out(struct vdo_completion *completion)
+{
+	bool was_discard, reclaimed;
+	u32 reclamations;
+	struct page_info *info = completion->parent;
+	struct vdo_page_cache *cache = info->cache;
+	struct block_map_page *page = (struct block_map_page *) get_page_buffer(info);
+
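+	/*
+	 * A page which has never been written goes out with initialized false; once that write
+	 * completes, rewrite it marked initialized (with a preflush) for torn write protection.
+	 */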
+	if (!page->header.initialized) {
+		page->header.initialized = true;
+		submit_metadata_vio(info->vio,
+				    info->pbn,
+				    write_cache_page_endio,
+				    handle_page_write_error,
+				    (REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH));
+		return;
+	}
+
+	/* Handle journal updates and torn write protection. */
+	vdo_release_recovery_journal_block_reference(cache->zone->block_map->journal,
+						     info->recovery_lock,
+						     VDO_ZONE_TYPE_LOGICAL,
+						     cache->zone->zone_number);
+	info->recovery_lock = 0;
+	was_discard = write_has_finished(info);
+	reclaimed = (!was_discard || (info->busy > 0) || vdo_has_waiters(&info->waiting));
+
+	set_info_state(info, PS_RESIDENT);
+
+	reclamations = distribute_page_over_queue(info, &info->waiting);
+	ADD_ONCE(cache->stats.reclaimed, reclamations);
+
+	if (was_discard)
+		cache->discard_count--;
+
+	if (reclaimed)
+		discard_page_if_needed(cache);
+	else
+		allocate_free_page(info);
+
+	check_for_drain_complete(cache->zone);
+}
+
+/**
+ * write_pages() - Write the batch of pages which were covered by the layer flush which just
+ *                 completed.
+ * @flush_completion: The flush vio.
+ *
+ * This callback is registered in save_pages().
+ */
+static void write_pages(struct vdo_completion *flush_completion)
+{
+	struct vdo_page_cache *cache = ((struct page_info *) flush_completion->parent)->cache;
+
+	/*
+	 * We need to cache these two values on the stack since in the error case below, it is
+	 * possible for the last page info to cause the page cache to get freed. Hence once we
+	 * launch the last page, it may be unsafe to dereference the cache [VDO-4724].
+	 */
+	bool has_unflushed_pages = (cache->pages_to_flush > 0);
+	page_count_t pages_in_flush = cache->pages_in_flush;
+
+	cache->pages_in_flush = 0;
+	while (pages_in_flush-- > 0) {
+		struct page_info *info =
+			list_first_entry(&cache->outgoing_list, struct page_info, state_entry);
+
+		list_del_init(&info->state_entry);
+		if (vdo_is_read_only(info->cache->zone->block_map->vdo)) {
+			struct vdo_completion *completion = &info->vio->completion;
+
+			vdo_reset_completion(completion);
+			completion->callback = page_is_written_out;
+			completion->error_handler = handle_page_write_error;
+			vdo_fail_completion(completion, VDO_READ_ONLY);
+			continue;
+		}
+		ADD_ONCE(info->cache->stats.pages_saved, 1);
+		submit_metadata_vio(info->vio,
+				    info->pbn,
+				    write_cache_page_endio,
+				    handle_page_write_error,
+				    REQ_OP_WRITE | REQ_PRIO);
+	}
+
+	if (has_unflushed_pages)
+		/*
+		 * If there are unflushed pages, the cache can't have been freed, so this call is
+		 * safe.
+		 */
+		save_pages(cache);
+}
+
+/**
+ * vdo_release_page_completion() - Release a VDO Page Completion.
+ *
+ * The page referenced by this completion (if any) will no longer be held busy by this completion.
+ * If a page becomes discardable and there are completions awaiting free pages then a new round of
+ * page discarding is started.
+ */
+void vdo_release_page_completion(struct vdo_completion *completion)
+{
+	struct page_info *discard_info = NULL;
+	struct vdo_page_completion *page_completion = as_vdo_page_completion(completion);
+	struct vdo_page_cache *cache;
+
+	if (completion->result == VDO_SUCCESS) {
+		if (!validate_completed_page_or_enter_read_only_mode(page_completion, false))
+			return;
+
+		if (--page_completion->info->busy == 0)
+			discard_info = page_completion->info;
+	}
+
+	ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL),
+			"Page being released after leaving all queues");
+
+	page_completion->info = NULL;
+	cache = page_completion->cache;
+	assert_on_cache_thread(cache, __func__);
+
+	if (discard_info != NULL) {
+		if (discard_info->write_status == WRITE_STATUS_DEFERRED) {
+			discard_info->write_status = WRITE_STATUS_NORMAL;
+			launch_page_save(discard_info);
+		}
+		/*
+		 * If there are excess requests for pages (that have not already started discards),
+		 * we need to discard some page (which may be this one).
+		 */
+		discard_page_if_needed(cache);
+	}
+}
+
+/**
+ * load_page_for_completion() - Helper function to load a page as described by a VDO Page
+ *                              Completion.
+ */
+static void
+load_page_for_completion(struct page_info *info, struct vdo_page_completion *vdo_page_comp)
+{
+	int result;
+
+	vdo_enqueue_waiter(&info->waiting, &vdo_page_comp->waiter);
+	result = launch_page_load(info, vdo_page_comp->pbn);
+	if (result != VDO_SUCCESS)
+		vdo_notify_all_waiters(&info->waiting, complete_waiter_with_error, &result);
+}
+
+/**
+ * vdo_get_page() - Initialize a page completion and get a block map page.
+ * @page_completion: The vdo_page_completion to initialize.
+ * @zone: The block map zone of the desired page.
+ * @pbn: The absolute physical block of the desired page.
+ * @writable: Whether the page can be modified.
+ * @parent: The object to notify when the fetch is complete.
+ * @callback: The notification callback.
+ * @error_handler: The handler for fetch errors.
+ * @requeue: Whether we must requeue when notifying the parent.
+ *
+ * May cause another page to be discarded (potentially writing a dirty page) and the one nominated
+ * by the completion to be loaded from disk. When the callback is invoked, the page will be
+ * resident in the cache and marked busy. All callers must call vdo_release_page_completion()
+ * when they are done with the page to clear the busy mark.
+ */
+void vdo_get_page(struct vdo_page_completion *page_completion,
+		  struct block_map_zone *zone,
+		  physical_block_number_t pbn,
+		  bool writable,
+		  void *parent,
+		  vdo_action *callback,
+		  vdo_action *error_handler,
+		  bool requeue)
+{
+	struct vdo_page_cache *cache = &zone->page_cache;
+	struct vdo_completion *completion = &page_completion->completion;
+	struct page_info *info;
+
+	assert_on_cache_thread(cache, __func__);
+	ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL),
+			"New page completion was not already on a wait queue");
+
+	*page_completion = (struct vdo_page_completion) {
+		.pbn = pbn,
+		.writable = writable,
+		.cache = cache,
+	};
+
+	vdo_initialize_completion(completion, cache->vdo, VDO_PAGE_COMPLETION);
+	vdo_prepare_completion(completion,
+			       callback,
+			       error_handler,
+			       cache->zone->thread_id,
+			       parent);
+	completion->requeue = requeue;
+
+	if (page_completion->writable && vdo_is_read_only(cache->zone->block_map->vdo)) {
+		vdo_fail_completion(completion, VDO_READ_ONLY);
+		return;
+	}
+
+	if (page_completion->writable)
+		ADD_ONCE(cache->stats.write_count, 1);
+	else
+		ADD_ONCE(cache->stats.read_count, 1);
+
+	info = find_page(cache, page_completion->pbn);
+	if (info != NULL) {
+		/* The page is in the cache already. */
+		if ((info->write_status == WRITE_STATUS_DEFERRED) ||
+		    is_incoming(info) ||
+		    (is_outgoing(info) && page_completion->writable)) {
+			/* The page is unusable until it has finished I/O. */
+			ADD_ONCE(cache->stats.wait_for_page, 1);
+			vdo_enqueue_waiter(&info->waiting, &page_completion->waiter);
+			return;
+		}
+
+		if (is_valid(info)) {
+			/* The page is usable. */
+			ADD_ONCE(cache->stats.found_in_cache, 1);
+			if (!is_present(info))
+				ADD_ONCE(cache->stats.read_outgoing, 1);
+			update_lru(info);
+			++info->busy;
+			complete_with_page(info, page_completion);
+			return;
+		}
+		/* Something horrible has gone wrong. */
+		ASSERT_LOG_ONLY(false, "Page info found in an unusable state.");
+	}
+
+	/* The page must be fetched. */
+	info = find_free_page(cache);
+	if (info != NULL) {
+		ADD_ONCE(cache->stats.fetch_required, 1);
+		load_page_for_completion(info, page_completion);
+		return;
+	}
+
+	/* The page must wait for a page to be discarded. */
+	ADD_ONCE(cache->stats.discard_required, 1);
+	discard_page_for_completion(page_completion);
+}
+
+/**
+ * vdo_request_page_write() - Request that a VDO page be written out as soon as it is not busy.
+ * @completion: The vdo_page_completion containing the page.
+ */
+void vdo_request_page_write(struct vdo_completion *completion)
+{
+	struct page_info *info;
+	struct vdo_page_completion *vdo_page_comp = as_vdo_page_completion(completion);
+
+	if (!validate_completed_page_or_enter_read_only_mode(vdo_page_comp, true))
+		return;
+
+	info = vdo_page_comp->info;
+	set_info_state(info, PS_DIRTY);
+	launch_page_save(info);
+}
+
+/**
+ * vdo_get_cached_page() - Get the block map page from a page completion.
+ * @completion: A vdo page completion whose callback has been called.
+ * @page_ptr: A pointer to hold the page.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_get_cached_page(struct vdo_completion *completion, struct block_map_page **page_ptr)
+{
+	int result;
+	struct vdo_page_completion *vpc;
+
+	vpc = as_vdo_page_completion(completion);
+	result = validate_completed_page(vpc, true);
+	if (result == VDO_SUCCESS)
+		*page_ptr = (struct block_map_page *) get_page_buffer(vpc->info);
+
+	return result;
+}
+
+/**
+ * vdo_invalidate_page_cache() - Invalidate all entries in the VDO page cache.
+ *
+ * There must not be any dirty pages in the cache.
+ *
+ * Return: A success or error code.
+ */
+int vdo_invalidate_page_cache(struct vdo_page_cache *cache)
+{
+	struct page_info *info;
+
+	assert_on_cache_thread(cache, __func__);
+
+	/* Make sure we don't throw away any dirty pages. */
+	for (info = cache->infos; info < cache->infos + cache->page_count; info++) {
+		int result = ASSERT(!is_dirty(info), "cache must have no dirty pages");
+
+		if (result != VDO_SUCCESS)
+			return result;
+	}
+
+	/* Reset the page map by re-allocating it. */
+	vdo_free_int_map(UDS_FORGET(cache->page_map));
+	return vdo_make_int_map(cache->page_count, 0, &cache->page_map);
+}
+
+/**
+ * get_tree_page_by_index() - Get the tree page for a given height and page index.
+ *
+ * Return: The requested page.
+ */
+static struct tree_page * __must_check
+get_tree_page_by_index(struct forest *forest,
+		       root_count_t root_index,
+		       height_t height,
+		       page_number_t page_index)
+{
+	page_number_t offset = 0;
+	size_t segment;
+
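+	/* Boundaries hold the cumulative page count at each height through each segment. */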
+	for (segment = 0; segment < forest->segments; segment++) {
+		page_number_t border = forest->boundaries[segment].levels[height - 1];
+
+		if (page_index < border) {
+			struct block_map_tree *tree = &forest->trees[root_index];
+
+			return &(tree->segments[segment].levels[height - 1][page_index - offset]);
+		}
+		offset = border;
+	}
+
+	return NULL;
+}
+
+/* Get the page referred to by the lock's tree slot at its current height. */
+static inline struct tree_page *
+get_tree_page(const struct block_map_zone *zone, const struct tree_lock *lock)
+{
+	return get_tree_page_by_index(zone->block_map->forest,
+				      lock->root_index,
+				      lock->height,
+				      lock->tree_slots[lock->height].page_index);
+}
+
+/** vdo_copy_valid_page() - Validate and copy a buffer to a page. */
+bool vdo_copy_valid_page(char *buffer,
+			 nonce_t nonce,
+			 physical_block_number_t pbn,
+			 struct block_map_page *page)
+{
+	struct block_map_page *loaded = (struct block_map_page *) buffer;
+	enum block_map_page_validity validity = vdo_validate_block_map_page(loaded, nonce, pbn);
+
+	if (validity == VDO_BLOCK_MAP_PAGE_VALID) {
+		memcpy(page, loaded, VDO_BLOCK_SIZE);
+		return true;
+	}
+
+	if (validity == VDO_BLOCK_MAP_PAGE_BAD)
+		uds_log_error_strerror(VDO_BAD_PAGE,
+				       "Expected page %llu but got page %llu instead",
+				       (unsigned long long) pbn,
+				       (unsigned long long) vdo_get_block_map_page_pbn(loaded));
+
+	return false;
+}
+
+/**
+ * in_cyclic_range() - Check whether the given value is between the lower and upper bounds, within
+ *                     a cyclic range of values from 0 to (modulus - 1).
+ * @lower: The lowest value to accept.
+ * @value: The value to check.
+ * @upper: The highest value to accept.
+ * @modulus: The size of the cyclic space, no more than 2^15.
+ *
+ * The value and both bounds must be smaller than the modulus.
+ *
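+ * For example, in_cyclic_range(250, 3, 10, 256) is true because 3 lies within the wrapped
+ * interval [250, 10] of the 256-value cyclic space.
+ *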
+ * Return: true if the value is in range.
+ */
+static bool in_cyclic_range(u16 lower, u16 value, u16 upper, u16 modulus)
+{
+	if (value < lower)
+		value += modulus;
+	if (upper < lower)
+		upper += modulus;
+	return (value <= upper);
+}
+
+/**
+ * is_not_older() - Check whether a generation is not older than some other generation in the
+ *                  context of a zone's current generation range.
+ * @zone: The zone in which to do the comparison.
+ * @a: The generation in question.
+ * @b: The generation to compare to.
+ *
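+ * For example, if the generation range has wrapped so that oldest_generation is 250 and
+ * generation is 5, then is_not_older(zone, 3, 252) is true because generation 3 follows 252 in
+ * the cyclic order.
+ *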
+ * Return: true if generation @a is not strictly older than generation @b in the context of @zone
+ */
+static bool __must_check is_not_older(struct block_map_zone *zone, u8 a, u8 b)
+{
+	int result;
+
+	result = ASSERT((in_cyclic_range(zone->oldest_generation, a, zone->generation, 1 << 8) &&
+			 in_cyclic_range(zone->oldest_generation, b, zone->generation, 1 << 8)),
+			"generation(s) %u, %u are out of range [%u, %u]",
+			a, b, zone->oldest_generation, zone->generation);
+	if (result != VDO_SUCCESS) {
+		enter_zone_read_only_mode(zone, result);
+		return true;
+	}
+
+	return in_cyclic_range(b, a, zone->generation, 1 << 8);
+}
+
+static void release_generation(struct block_map_zone *zone, u8 generation)
+{
+	int result;
+
+	result = ASSERT((zone->dirty_page_counts[generation] > 0),
+			"dirty page count underflow for generation %u",
+			generation);
+	if (result != VDO_SUCCESS) {
+		enter_zone_read_only_mode(zone, result);
+		return;
+	}
+
+	zone->dirty_page_counts[generation]--;
+	while ((zone->dirty_page_counts[zone->oldest_generation] == 0) &&
+	       (zone->oldest_generation != zone->generation))
+		zone->oldest_generation++;
+}
+
+static void
+set_generation(struct block_map_zone *zone, struct tree_page *page, u8 new_generation)
+{
+	u32 new_count;
+	int result;
+	bool decrement_old = vdo_is_waiting(&page->waiter);
+	u8 old_generation = page->generation;
+
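+	/*
+	 * A page which is already waiting to be written holds a count against its old generation,
+	 * which must be released when it moves to the new one.
+	 */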
+	if (decrement_old && (old_generation == new_generation))
+		return;
+
+	page->generation = new_generation;
+	new_count = ++zone->dirty_page_counts[new_generation];
+	result = ASSERT((new_count != 0),
+			"dirty page count overflow for generation %u",
+			new_generation);
+	if (result != VDO_SUCCESS) {
+		enter_zone_read_only_mode(zone, result);
+		return;
+	}
+
+	if (decrement_old)
+		release_generation(zone, old_generation);
+}
+
+static void write_page(struct tree_page *tree_page, struct pooled_vio *vio);
+
+/* Implements waiter_callback */
+static void write_page_callback(struct waiter *waiter, void *context)
+{
+	write_page(container_of(waiter, struct tree_page, waiter), (struct pooled_vio *) context);
+}
+
+static void acquire_vio(struct waiter *waiter, struct block_map_zone *zone)
+{
+	waiter->callback = write_page_callback;
+	acquire_vio_from_pool(zone->vio_pool, waiter);
+}
+
+/* Return: true if a new generation could be started (not all generations were already active). */
+static bool attempt_increment(struct block_map_zone *zone)
+{
+	u8 generation = zone->generation + 1;
+
+	if (zone->oldest_generation == generation)
+		return false;
+
+	zone->generation = generation;
+	return true;
+}
+
+/*
+ * Launch a flush by making this page the zone's flusher if no flush is in progress and a new
+ * generation can be started; otherwise queue the page to wait for the current flush.
+ */
+static void enqueue_page(struct tree_page *page, struct block_map_zone *zone)
+{
+	if ((zone->flusher == NULL) && attempt_increment(zone)) {
+		zone->flusher = page;
+		acquire_vio(&page->waiter, zone);
+		return;
+	}
+
+	vdo_enqueue_waiter(&zone->flush_waiters, &page->waiter);
+}
+
+static void write_page_if_not_dirtied(struct waiter *waiter, void *context)
+{
+	struct tree_page *page = container_of(waiter, struct tree_page, waiter);
+	struct write_if_not_dirtied_context *write_context = context;
+
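+	/*
+	 * Pages still in the generation covered by the flush which just completed can be written
+	 * immediately; pages dirtied since then must wait for another flush.
+	 */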
+	if (page->generation == write_context->generation) {
+		acquire_vio(waiter, write_context->zone);
+		return;
+	}
+
+	enqueue_page(page, write_context->zone);
+}
+
+static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio)
+{
+	return_vio_to_pool(zone->vio_pool, vio);
+	check_for_drain_complete(zone);
+}
+
+/* This callback is registered in write_initialized_page(). */
+static void finish_page_write(struct vdo_completion *completion)
+{
+	bool dirty;
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
+	struct tree_page *page = completion->parent;
+	struct block_map_zone *zone = pooled->context;
+
+	vdo_release_recovery_journal_block_reference(zone->block_map->journal,
+						     page->writing_recovery_lock,
+						     VDO_ZONE_TYPE_LOGICAL,
+						     zone->zone_number);
+
+	dirty = (page->writing_generation != page->generation);
+	release_generation(zone, page->writing_generation);
+	page->writing = false;
+
+	if (zone->flusher == page) {
+		struct write_if_not_dirtied_context context = {
+			.zone = zone,
+			.generation = page->writing_generation,
+		};
+
+		vdo_notify_all_waiters(&zone->flush_waiters, write_page_if_not_dirtied, &context);
+		if (dirty && attempt_increment(zone)) {
+			write_page(page, pooled);
+			return;
+		}
+
+		zone->flusher = NULL;
+	}
+
+	if (dirty) {
+		enqueue_page(page, zone);
+	} else if ((zone->flusher == NULL) &&
+		   vdo_has_waiters(&zone->flush_waiters) &&
+		   attempt_increment(zone)) {
+		zone->flusher = container_of(vdo_dequeue_next_waiter(&zone->flush_waiters),
+					     struct tree_page,
+					     waiter);
+		write_page(zone->flusher, pooled);
+		return;
+	}
+
+	return_to_pool(zone, pooled);
+}
+
+static void handle_write_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
+	struct block_map_zone *zone = pooled->context;
+
+	vio_record_metadata_io_error(vio);
+	enter_zone_read_only_mode(zone, result);
+	return_to_pool(zone, pooled);
+}
+
+static void write_page_endio(struct bio *bio);
+
+static void write_initialized_page(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
+	struct block_map_zone *zone = pooled->context;
+	struct tree_page *tree_page = completion->parent;
+	struct block_map_page *page = (struct block_map_page *) vio->data;
+	unsigned int operation = REQ_OP_WRITE | REQ_PRIO;
+
+	/*
+	 * Now that we know the page has been written at least once, mark the copy we are writing
+	 * as initialized.
+	 */
+	page->header.initialized = true;
+
+	if (zone->flusher == tree_page)
+		operation |= REQ_PREFLUSH;
+
+	submit_metadata_vio(vio,
+			    vdo_get_block_map_page_pbn(page),
+			    write_page_endio,
+			    handle_write_error,
+			    operation);
+}
+
+static void write_page_endio(struct bio *bio)
+{
+	struct pooled_vio *vio = bio->bi_private;
+	struct block_map_zone *zone = vio->context;
+	struct block_map_page *page = (struct block_map_page *) vio->vio.data;
+
+	continue_vio_after_io(&vio->vio,
+			      (page->header.initialized ?
+			       finish_page_write :
+			       write_initialized_page),
+			      zone->thread_id);
+}
+
+static void write_page(struct tree_page *tree_page, struct pooled_vio *vio)
+{
+	struct vdo_completion *completion = &vio->vio.completion;
+	struct block_map_zone *zone = vio->context;
+	struct block_map_page *page = vdo_as_block_map_page(tree_page);
+
+	if ((zone->flusher != tree_page) &&
+	    is_not_older(zone, tree_page->generation, zone->generation)) {
+		/*
+		 * This page was re-dirtied after the last flush was issued, hence we need to do
+		 * another flush.
+		 */
+		enqueue_page(tree_page, zone);
+		return_to_pool(zone, vio);
+		return;
+	}
+
+	completion->parent = tree_page;
+	memcpy(vio->vio.data, tree_page->page_buffer, VDO_BLOCK_SIZE);
+	completion->callback_thread_id = zone->thread_id;
+
+	tree_page->writing = true;
+	tree_page->writing_generation = tree_page->generation;
+	tree_page->writing_recovery_lock = tree_page->recovery_lock;
+
+	/* Clear this now so that we know this page is not on any dirty list. */
+	tree_page->recovery_lock = 0;
+
+	/*
+	 * We've already copied the page into the vio which will write it, so if it was not yet
+	 * initialized, the first write will indicate that (for torn write protection). It is now
+	 * safe to mark it as initialized in memory since if the write fails, the in memory state
+	 * will become irrelevant.
+	 */
+	if (page->header.initialized) {
+		write_initialized_page(completion);
+		return;
+	}
+
+	page->header.initialized = true;
+	submit_metadata_vio(&vio->vio,
+			    vdo_get_block_map_page_pbn(page),
+			    write_page_endio,
+			    handle_write_error,
+			    REQ_OP_WRITE | REQ_PRIO);
+}
+
+/* Release a lock on a page which was being loaded or allocated. */
+static void release_page_lock(struct data_vio *data_vio, char *what)
+{
+	struct block_map_zone *zone;
+	struct tree_lock *lock_holder;
+	struct tree_lock *lock = &data_vio->tree_lock;
+
+	ASSERT_LOG_ONLY(lock->locked,
+			"release of unlocked block map page %s for key %llu in tree %u",
+			what, (unsigned long long) lock->key,
+			lock->root_index);
+
+	zone = data_vio->logical.zone->block_map_zone;
+	lock_holder = vdo_int_map_remove(zone->loading_pages, lock->key);
+	ASSERT_LOG_ONLY((lock_holder == lock),
+			"block map page %s mismatch for key %llu in tree %u",
+			what,
+			(unsigned long long) lock->key,
+			lock->root_index);
+	lock->locked = false;
+}
+
+static void finish_lookup(struct data_vio *data_vio, int result)
+{
+	data_vio->tree_lock.height = 0;
+
+	--data_vio->logical.zone->block_map_zone->active_lookups;
+
+	set_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot);
+	data_vio->vio.completion.error_handler = handle_data_vio_error;
+	continue_data_vio_with_error(data_vio, result);
+}
+
+static void abort_lookup_for_waiter(struct waiter *waiter, void *context)
+{
+	struct data_vio *data_vio = waiter_as_data_vio(waiter);
+	int result = *((int *) context);
+
+	if (!data_vio->write) {
+		if (result == VDO_NO_SPACE)
+			result = VDO_SUCCESS;
+	} else if (result != VDO_NO_SPACE) {
+		result = VDO_READ_ONLY;
+	}
+
+	finish_lookup(data_vio, result);
+}
+
+static void abort_lookup(struct data_vio *data_vio, int result, char *what)
+{
+	if (result != VDO_NO_SPACE)
+		enter_zone_read_only_mode(data_vio->logical.zone->block_map_zone, result);
+
+	if (data_vio->tree_lock.locked) {
+		release_page_lock(data_vio, what);
+		vdo_notify_all_waiters(&data_vio->tree_lock.waiters,
+				       abort_lookup_for_waiter,
+				       &result);
+	}
+
+	finish_lookup(data_vio, result);
+}
+
+static void abort_load(struct data_vio *data_vio, int result)
+{
+	abort_lookup(data_vio, result, "load");
+}
+
+static bool __must_check
+is_invalid_tree_entry(const struct vdo *vdo, const struct data_location *mapping, height_t height)
+{
+	if (!vdo_is_valid_location(mapping) ||
+	    vdo_is_state_compressed(mapping->state) ||
+	    (vdo_is_mapped_location(mapping) && (mapping->pbn == VDO_ZERO_BLOCK)))
+		return true;
+
+	/* Roots aren't physical data blocks, so we can't check their PBNs. */
+	if (height == VDO_BLOCK_MAP_TREE_HEIGHT)
+		return false;
+
+	return !vdo_is_physical_data_block(vdo->depot, mapping->pbn);
+}
+
+static void load_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio);
+static void allocate_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio);
+
+static void continue_with_loaded_page(struct data_vio *data_vio, struct block_map_page *page)
+{
+	struct tree_lock *lock = &data_vio->tree_lock;
+	struct block_map_tree_slot slot = lock->tree_slots[lock->height];
+	struct data_location mapping =
+		vdo_unpack_block_map_entry(&page->entries[slot.block_map_slot.slot]);
+
+	if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) {
+		uds_log_error_strerror(VDO_BAD_MAPPING,
+				       "Invalid block map tree PBN: %llu with state %u for page index %u at height %u",
+				       (unsigned long long) mapping.pbn,
+				       mapping.state,
+				       lock->tree_slots[lock->height - 1].page_index,
+				       lock->height - 1);
+		abort_load(data_vio, VDO_BAD_MAPPING);
+		return;
+	}
+
+	if (!vdo_is_mapped_location(&mapping)) {
+		/* The page we need is unallocated */
+		allocate_block_map_page(data_vio->logical.zone->block_map_zone, data_vio);
+		return;
+	}
+
+	lock->tree_slots[lock->height - 1].block_map_slot.pbn = mapping.pbn;
+	if (lock->height == 1) {
+		finish_lookup(data_vio, VDO_SUCCESS);
+		return;
+	}
+
+	/* We know what page we need to load next */
+	load_block_map_page(data_vio->logical.zone->block_map_zone, data_vio);
+}
+
+static void continue_load_for_waiter(struct waiter *waiter, void *context)
+{
+	struct data_vio *data_vio = waiter_as_data_vio(waiter);
+
+	data_vio->tree_lock.height--;
+	continue_with_loaded_page(data_vio, (struct block_map_page *) context);
+}
+
+static void finish_block_map_page_load(struct vdo_completion *completion)
+{
+	physical_block_number_t pbn;
+	struct tree_page *tree_page;
+	struct block_map_page *page;
+	nonce_t nonce;
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+	struct data_vio *data_vio = completion->parent;
+	struct block_map_zone *zone = pooled->context;
+	struct tree_lock *tree_lock = &data_vio->tree_lock;
+
+	tree_lock->height--;
+	pbn = tree_lock->tree_slots[tree_lock->height].block_map_slot.pbn;
+	tree_page = get_tree_page(zone, tree_lock);
+	page = (struct block_map_page *) tree_page->page_buffer;
+	nonce = zone->block_map->nonce;
+
+	if (!vdo_copy_valid_page(vio->data, nonce, pbn, page))
+		vdo_format_block_map_page(page, nonce, pbn, false);
+	return_vio_to_pool(zone->vio_pool, pooled);
+
+	/* Release our claim to the load and wake any waiters */
+	release_page_lock(data_vio, "load");
+	vdo_notify_all_waiters(&tree_lock->waiters, continue_load_for_waiter, page);
+	continue_with_loaded_page(data_vio, page);
+}
+
+static void handle_io_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio);
+	struct data_vio *data_vio = completion->parent;
+	struct block_map_zone *zone = pooled->context;
+
+	vio_record_metadata_io_error(vio);
+	return_vio_to_pool(zone->vio_pool, pooled);
+	abort_load(data_vio, result);
+}
+
+static void load_page_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct data_vio *data_vio = vio->completion.parent;
+
+	continue_vio_after_io(vio, finish_block_map_page_load, data_vio->logical.zone->thread_id);
+}
+
+static void load_page(struct waiter *waiter, void *context)
+{
+	struct pooled_vio *pooled = context;
+	struct data_vio *data_vio = waiter_as_data_vio(waiter);
+	struct tree_lock *lock = &data_vio->tree_lock;
+	physical_block_number_t pbn = lock->tree_slots[lock->height - 1].block_map_slot.pbn;
+
+	pooled->vio.completion.parent = data_vio;
+	submit_metadata_vio(&pooled->vio,
+			    pbn,
+			    load_page_endio,
+			    handle_io_error,
+			    REQ_OP_READ | REQ_PRIO);
+}
+
+/*
+ * If the page is already locked, queue up to wait for the lock to be released. If the lock is
+ * acquired, @data_vio->tree_lock.locked will be true.
+ */
+static int attempt_page_lock(struct block_map_zone *zone, struct data_vio *data_vio)
+{
+	int result;
+	struct tree_lock *lock_holder;
+	struct tree_lock *lock = &data_vio->tree_lock;
+	height_t height = lock->height;
+	struct block_map_tree_slot tree_slot = lock->tree_slots[height];
+	union page_key key;
+
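+	/* Pack the page's coordinates into a single key for the zone's loading_pages map. */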
+	key.descriptor = (struct page_descriptor) {
+		.root_index = lock->root_index,
+		.height = height,
+		.page_index = tree_slot.page_index,
+		.slot = tree_slot.block_map_slot.slot,
+	};
+	lock->key = key.key;
+
+	result = vdo_int_map_put(zone->loading_pages,
+				 lock->key,
+				 lock,
+				 false,
+				 (void **) &lock_holder);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (lock_holder == NULL) {
+		/* We got the lock */
+		data_vio->tree_lock.locked = true;
+		return VDO_SUCCESS;
+	}
+
+	/* Someone else is loading or allocating the page we need */
+	vdo_enqueue_waiter(&lock_holder->waiters, &data_vio->waiter);
+	return VDO_SUCCESS;
+}
+
+/* Load a block map tree page from disk, for the next level in the data vio tree lock. */
+static void load_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio)
+{
+	int result;
+
+	result = attempt_page_lock(zone, data_vio);
+	if (result != VDO_SUCCESS) {
+		abort_load(data_vio, result);
+		return;
+	}
+
+	if (data_vio->tree_lock.locked) {
+		data_vio->waiter.callback = load_page;
+		acquire_vio_from_pool(zone->vio_pool, &data_vio->waiter);
+	}
+}
+
+static void allocation_failure(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	if (vdo_requeue_completion_if_needed(completion, data_vio->logical.zone->thread_id))
+		return;
+
+	abort_lookup(data_vio, completion->result, "allocation");
+}
+
+static void continue_allocation_for_waiter(struct waiter *waiter, void *context)
+{
+	struct data_vio *data_vio = waiter_as_data_vio(waiter);
+	struct tree_lock *tree_lock = &data_vio->tree_lock;
+	physical_block_number_t pbn = *((physical_block_number_t *) context);
+
+	tree_lock->height--;
+	data_vio->tree_lock.tree_slots[tree_lock->height].block_map_slot.pbn = pbn;
+
+	if (tree_lock->height == 0) {
+		finish_lookup(data_vio, VDO_SUCCESS);
+		return;
+	}
+
+	allocate_block_map_page(data_vio->logical.zone->block_map_zone, data_vio);
+}
+
+/** expire_oldest_list() - Move the dirty pages in the oldest era onto the expired lists. */
+static void expire_oldest_list(struct dirty_lists *dirty_lists)
+{
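+	/* The eras array is used as a circular buffer of maximum_age periods. */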
+	block_count_t i = dirty_lists->offset++;
+
+	dirty_lists->oldest_period++;
+	if (!list_empty(&dirty_lists->eras[i][VDO_TREE_PAGE]))
+		list_splice_tail_init(&dirty_lists->eras[i][VDO_TREE_PAGE],
+				      &dirty_lists->expired[VDO_TREE_PAGE]);
+	if (!list_empty(&dirty_lists->eras[i][VDO_CACHE_PAGE]))
+		list_splice_tail_init(&dirty_lists->eras[i][VDO_CACHE_PAGE],
+				      &dirty_lists->expired[VDO_CACHE_PAGE]);
+
+	if (dirty_lists->offset == dirty_lists->maximum_age)
+		dirty_lists->offset = 0;
+}
+
+/** update_period() - Update the dirty_lists period if necessary. */
+static void update_period(struct dirty_lists *dirty, sequence_number_t period)
+{
+	while (dirty->next_period <= period) {
+		if ((dirty->next_period - dirty->oldest_period) == dirty->maximum_age)
+			expire_oldest_list(dirty);
+		dirty->next_period++;
+	}
+}
+
+/** write_expired_elements() - Write out the expired list. */
+static void write_expired_elements(struct block_map_zone *zone)
+{
+	struct tree_page *page, *ttmp;
+	struct page_info *info, *ptmp;
+	struct list_head *expired;
+	u8 generation = zone->generation;
+
+	expired = &zone->dirty_lists->expired[VDO_TREE_PAGE];
+	list_for_each_entry_safe(page, ttmp, expired, entry) {
+		int result;
+
+		list_del_init(&page->entry);
+
+		result = ASSERT(!vdo_is_waiting(&page->waiter),
+				"Newly expired page not already waiting to write");
+		if (result != VDO_SUCCESS) {
+			enter_zone_read_only_mode(zone, result);
+			continue;
+		}
+
+		set_generation(zone, page, generation);
+		if (!page->writing)
+			enqueue_page(page, zone);
+	}
+
+	expired = &zone->dirty_lists->expired[VDO_CACHE_PAGE];
+	list_for_each_entry_safe(info, ptmp, expired, state_entry) {
+		list_del_init(&info->state_entry);
+		schedule_page_save(info);
+	}
+
+	save_pages(&zone->page_cache);
+}
+
+/**
+ * add_to_dirty_lists() - Add an element to the dirty lists.
+ * @zone: The zone in which we are operating.
+ * @entry: The list entry of the element to add.
+ * @type: The type of page.
+ * @old_period: The period in which the element was previously dirtied, or 0 if it was not dirty.
+ * @new_period: The period in which the element has now been dirtied, or 0 if it does not hold a
+ *              lock.
+ */
+static void
+add_to_dirty_lists(struct block_map_zone *zone,
+		   struct list_head *entry,
+		   enum block_map_page_type type,
+		   sequence_number_t old_period,
+		   sequence_number_t new_period)
+{
+	struct dirty_lists *dirty_lists = zone->dirty_lists;
+
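+	/*
+	 * If the element is already dirty in the same or an older period, it is already on a list
+	 * which will expire no later than required, so there is nothing to do.
+	 */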
+	if ((old_period == new_period) || ((old_period != 0) && (old_period < new_period)))
+		return;
+
+	if (new_period < dirty_lists->oldest_period) {
+		list_move_tail(entry, &dirty_lists->expired[type]);
+	} else {
+		update_period(dirty_lists, new_period);
+		list_move_tail(entry,
+			       &dirty_lists->eras[new_period % dirty_lists->maximum_age][type]);
+	}
+
+	write_expired_elements(zone);
+}
+
+/*
+ * Record the allocation in the tree and wake any waiters now that the write lock has been
+ * released.
+ */
+static void finish_block_map_allocation(struct vdo_completion *completion)
+{
+	physical_block_number_t pbn;
+	struct tree_page *tree_page;
+	struct block_map_page *page;
+	sequence_number_t old_lock;
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+	struct tree_lock *tree_lock = &data_vio->tree_lock;
+	height_t height = tree_lock->height;
+
+	assert_data_vio_in_logical_zone(data_vio);
+
+	tree_page = get_tree_page(zone, tree_lock);
+	pbn = tree_lock->tree_slots[height - 1].block_map_slot.pbn;
+
+	/* Record the allocation. */
+	page = (struct block_map_page *) tree_page->page_buffer;
+	old_lock = tree_page->recovery_lock;
+	vdo_update_block_map_page(page,
+				  data_vio,
+				  pbn,
+				  VDO_MAPPING_STATE_UNCOMPRESSED,
+				  &tree_page->recovery_lock);
+
+	if (vdo_is_waiting(&tree_page->waiter)) {
+		/* This page is waiting to be written out. */
+		if (zone->flusher != tree_page)
+			/*
+			 * The outstanding flush won't cover the update we just made, so mark the
+			 * page as needing another flush.
+			 */
+			set_generation(zone, tree_page, zone->generation);
+	} else {
+		/* Put the page on a dirty list */
+		if (old_lock == 0)
+			INIT_LIST_HEAD(&tree_page->entry);
+		add_to_dirty_lists(zone,
+				   &tree_page->entry,
+				   VDO_TREE_PAGE,
+				   old_lock,
+				   tree_page->recovery_lock);
+	}
+
+	tree_lock->height--;
+	if (height > 1) {
+		/* Format the interior node we just allocated (in memory). */
+		tree_page = get_tree_page(zone, tree_lock);
+		vdo_format_block_map_page(tree_page->page_buffer,
+					  zone->block_map->nonce,
+					  pbn,
+					  false);
+	}
+
+	/* Release our claim to the allocation and wake any waiters */
+	release_page_lock(data_vio, "allocation");
+	vdo_notify_all_waiters(&tree_lock->waiters, continue_allocation_for_waiter, &pbn);
+	if (tree_lock->height == 0) {
+		finish_lookup(data_vio, VDO_SUCCESS);
+		return;
+	}
+
+	allocate_block_map_page(zone, data_vio);
+}
+
+static void release_block_map_write_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_allocated_zone(data_vio);
+
+	release_data_vio_allocation_lock(data_vio, true);
+	launch_data_vio_logical_callback(data_vio, finish_block_map_allocation);
+}
+
+/*
+ * Newly allocated block map pages are set to have MAXIMUM_REFERENCES after they are journaled,
+ * to prevent deduplication against the block after we release the write lock on it, but before we
+ * write out the page.
+ */
+static void set_block_map_page_reference_count(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_allocated_zone(data_vio);
+
+	completion->callback = release_block_map_write_lock;
+	vdo_modify_reference_count(completion, &data_vio->increment_updater);
+}
+
+static void journal_block_map_allocation(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_journal_zone(data_vio);
+
+	set_data_vio_allocated_zone_callback(data_vio, set_block_map_page_reference_count);
+	vdo_add_recovery_journal_entry(completion->vdo->recovery_journal, data_vio);
+}
+
+static void allocate_block(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct tree_lock *lock = &data_vio->tree_lock;
+	physical_block_number_t pbn;
+
+	assert_data_vio_in_allocated_zone(data_vio);
+
+	if (!vdo_allocate_block_in_zone(data_vio))
+		return;
+
+	pbn = data_vio->allocation.pbn;
+	lock->tree_slots[lock->height - 1].block_map_slot.pbn = pbn;
+	data_vio->increment_updater = (struct reference_updater) {
+		.operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING,
+		.increment = true,
+		.zpbn = {
+			.pbn = pbn,
+			.state = VDO_MAPPING_STATE_UNCOMPRESSED,
+		},
+		.lock = data_vio->allocation.lock,
+	};
+
+	launch_data_vio_journal_callback(data_vio, journal_block_map_allocation);
+}
+
+static void allocate_block_map_page(struct block_map_zone *zone, struct data_vio *data_vio)
+{
+	int result;
+
+	if (!data_vio->write || data_vio->is_trim) {
+		/* This is a pure read or a trim, so there's nothing left to do here. */
+		finish_lookup(data_vio, VDO_SUCCESS);
+		return;
+	}
+
+	result = attempt_page_lock(zone, data_vio);
+	if (result != VDO_SUCCESS) {
+		abort_lookup(data_vio, result, "allocation");
+		return;
+	}
+
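+	/*
+	 * Another data_vio is already allocating this page; this one has been queued on the tree
+	 * lock and will be continued when that allocation finishes.
+	 */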
+	if (!data_vio->tree_lock.locked)
+		return;
+
+	data_vio_allocate_data_block(data_vio,
+				     VIO_BLOCK_MAP_WRITE_LOCK,
+				     allocate_block,
+				     allocation_failure);
+}
+
+/*
+ * vdo_find_block_map_slot(): Find the block map slot in which the block map entry for a data_vio
+ *                            resides and cache that result in the data_vio.
+ *
+ * All ancestors in the tree will be allocated or loaded, as needed.
+ */
+void vdo_find_block_map_slot(struct data_vio *data_vio)
+{
+	page_number_t page_index;
+	struct block_map_tree_slot tree_slot;
+	struct data_location mapping;
+	struct block_map_page *page = NULL;
+	struct tree_lock *lock = &data_vio->tree_lock;
+	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+
+	zone->active_lookups++;
+	if (vdo_is_state_draining(&zone->state)) {
+		finish_lookup(data_vio, VDO_SHUTTING_DOWN);
+		return;
+	}
+
+	lock->tree_slots[0].block_map_slot.slot =
+		data_vio->logical.lbn % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+	page_index = (lock->tree_slots[0].page_index / zone->block_map->root_count);
+	tree_slot = (struct block_map_tree_slot) {
+		.page_index = page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
+		.block_map_slot = {
+			.pbn = 0,
+			.slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
+		},
+	};
+
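+	/* Walk up from the leaf until an already-allocated ancestor page is found. */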
+	for (lock->height = 1; lock->height <= VDO_BLOCK_MAP_TREE_HEIGHT; lock->height++) {
+		physical_block_number_t pbn;
+
+		lock->tree_slots[lock->height] = tree_slot;
+		page = (struct block_map_page *) (get_tree_page(zone, lock)->page_buffer);
+		pbn = vdo_get_block_map_page_pbn(page);
+		if (pbn != VDO_ZERO_BLOCK) {
+			lock->tree_slots[lock->height].block_map_slot.pbn = pbn;
+			break;
+		}
+
+		/* Calculate the index and slot for the next level. */
+		tree_slot.block_map_slot.slot =
+			tree_slot.page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+		tree_slot.page_index = tree_slot.page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+	}
+
+	/* The page at this height has been allocated and loaded. */
+	mapping = vdo_unpack_block_map_entry(&page->entries[tree_slot.block_map_slot.slot]);
+	if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) {
+		uds_log_error_strerror(VDO_BAD_MAPPING,
+				       "Invalid block map tree PBN: %llu with state %u for page index %u at height %u",
+				       (unsigned long long) mapping.pbn,
+				       mapping.state,
+				       lock->tree_slots[lock->height - 1].page_index,
+				       lock->height - 1);
+		abort_load(data_vio, VDO_BAD_MAPPING);
+		return;
+	}
+
+	if (!vdo_is_mapped_location(&mapping)) {
+		/* The page we want one level down has not been allocated, so allocate it. */
+		allocate_block_map_page(zone, data_vio);
+		return;
+	}
+
+	lock->tree_slots[lock->height - 1].block_map_slot.pbn = mapping.pbn;
+	if (lock->height == 1) {
+		/* This is the ultimate block map page, so we're done */
+		finish_lookup(data_vio, VDO_SUCCESS);
+		return;
+	}
+
+	/* We know what page we need to load. */
+	load_block_map_page(zone, data_vio);
+}
+
+/*
+ * Find the PBN of a leaf block map page. This method may only be used after all allocated tree
+ * pages have been loaded; otherwise, it may give the wrong answer (0).
+ */
+physical_block_number_t
+vdo_find_block_map_page_pbn(struct block_map *map, page_number_t page_number)
+{
+	struct data_location mapping;
+	struct tree_page *tree_page;
+	struct block_map_page *page;
+	root_count_t root_index = page_number % map->root_count;
+	page_number_t page_index = page_number / map->root_count;
+	slot_number_t slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+
+	page_index /= VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+
+	tree_page = get_tree_page_by_index(map->forest, root_index, 1, page_index);
+	page = (struct block_map_page *) tree_page->page_buffer;
+	if (!page->header.initialized)
+		return VDO_ZERO_BLOCK;
+
+	mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
+	if (!vdo_is_valid_location(&mapping) || vdo_is_state_compressed(mapping.state))
+		return VDO_ZERO_BLOCK;
+	return mapping.pbn;
+}
+
+/*
+ * Write a tree page or indicate that it has been re-dirtied if it is already being written. This
+ * method is used when correcting errors in the tree during read-only rebuild.
+ */
+void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone)
+{
+	bool waiting = vdo_is_waiting(&page->waiter);
+
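+	/* The flush already in progress will cover this page. */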
+	if (waiting && (zone->flusher == page))
+		return;
+
+	set_generation(zone, page, zone->generation);
+	if (waiting || page->writing)
+		return;
+
+	enqueue_page(page, zone);
+}
+
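+/*
+ * Allocate a new segment of tree pages and graft it onto the boundary, page, and per-tree segment
+ * bookkeeping copied from the old forest (if any).
+ */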
+static int make_segment(struct forest *old_forest,
+			block_count_t new_pages,
+			struct boundary *new_boundary,
+			struct forest *forest)
+{
+	size_t index = (old_forest == NULL) ? 0 : old_forest->segments;
+	struct tree_page *page_ptr;
+	page_count_t segment_sizes[VDO_BLOCK_MAP_TREE_HEIGHT];
+	height_t height;
+	root_count_t root;
+	int result;
+
+	forest->segments = index + 1;
+
+	result = UDS_ALLOCATE(forest->segments, struct boundary,
+			      "forest boundary array", &forest->boundaries);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(forest->segments,
+			      struct tree_page *,
+			      "forest page pointers",
+			      &forest->pages);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(new_pages,
+			      struct tree_page,
+			      "new forest pages",
+			      &forest->pages[index]);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (index > 0) {
+		memcpy(forest->boundaries,
+		       old_forest->boundaries,
+		       index * sizeof(struct boundary));
+		memcpy(forest->pages, old_forest->pages, index * sizeof(struct tree_page *));
+	}
+
+	memcpy(&(forest->boundaries[index]), new_boundary, sizeof(struct boundary));
+
+	for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) {
+		segment_sizes[height] = new_boundary->levels[height];
+		if (index > 0)
+			segment_sizes[height] -= old_forest->boundaries[index - 1].levels[height];
+	}
+
+	page_ptr = forest->pages[index];
+	for (root = 0; root < forest->map->root_count; root++) {
+		struct block_map_tree_segment *segment;
+		struct block_map_tree *tree = &(forest->trees[root]);
+		height_t height;
+
+		int result = UDS_ALLOCATE(forest->segments,
+					  struct block_map_tree_segment,
+					  "tree root segments",
+					  &tree->segments);
+		if (result != VDO_SUCCESS)
+			return result;
+
+		if (index > 0)
+			memcpy(tree->segments,
+			       old_forest->trees[root].segments,
+			       index * sizeof(struct block_map_tree_segment));
+
+		segment = &(tree->segments[index]);
+		for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) {
+			if (segment_sizes[height] == 0)
+				continue;
+
+			segment->levels[height] = page_ptr;
+			if (height == (VDO_BLOCK_MAP_TREE_HEIGHT - 1)) {
+				/* Record the root. */
+				struct block_map_page *page =
+					vdo_format_block_map_page(page_ptr->page_buffer,
+								  forest->map->nonce,
+								  VDO_INVALID_PBN,
+								  true);
+				page->entries[0] =
+					vdo_pack_block_map_entry(forest->map->root_origin + root,
+								 VDO_MAPPING_STATE_UNCOMPRESSED);
+			}
+			page_ptr += segment_sizes[height];
+		}
+	}
+
+	return VDO_SUCCESS;
+}
+
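+/*
+ * Free a forest and its page segments, starting at first_page_segment; earlier page segments are
+ * owned by another forest and are not freed here.
+ */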
+static void deforest(struct forest *forest, size_t first_page_segment)
+{
+	root_count_t root;
+
+	if (forest->pages != NULL) {
+		size_t segment;
+
+		for (segment = first_page_segment; segment < forest->segments; segment++)
+			UDS_FREE(forest->pages[segment]);
+		UDS_FREE(forest->pages);
+	}
+
+	for (root = 0; root < forest->map->root_count; root++)
+		UDS_FREE(forest->trees[root].segments);
+
+	UDS_FREE(forest->boundaries);
+	UDS_FREE(forest);
+}
+
+/**
+ * make_forest() - Make a collection of trees for a block_map, expanding the existing forest if
+ *                 there is one.
+ * @entries: The number of entries the block map will hold.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int make_forest(struct block_map *map, block_count_t entries)
+{
+	struct forest *forest, *old_forest = map->forest;
+	struct boundary new_boundary, *old_boundary = NULL;
+	block_count_t new_pages;
+	int result;
+
+	if (old_forest != NULL)
+		old_boundary = &(old_forest->boundaries[old_forest->segments - 1]);
+
+	new_pages = vdo_compute_new_forest_pages(map->root_count, old_boundary,
+						 entries, &new_boundary);
+	if (new_pages == 0) {
+		map->next_entry_count = entries;
+		return VDO_SUCCESS;
+	}
+
+	result = UDS_ALLOCATE_EXTENDED(struct forest, map->root_count,
+				       struct block_map_tree, __func__,
+				       &forest);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	forest->map = map;
+	result = make_segment(old_forest, new_pages, &new_boundary, forest);
+	if (result != VDO_SUCCESS) {
+		deforest(forest, forest->segments - 1);
+		return result;
+	}
+
+	map->next_forest = forest;
+	map->next_entry_count = entries;
+	return VDO_SUCCESS;
+}
+
+/**
+ * replace_forest() - Replace a block_map's forest with the already-prepared larger forest.
+ */
+static void replace_forest(struct block_map *map)
+{
+	if (map->next_forest != NULL) {
+		if (map->forest != NULL)
+			deforest(map->forest, map->forest->segments);
+		map->forest = UDS_FORGET(map->next_forest);
+	}
+
+	map->entry_count = map->next_entry_count;
+	map->next_entry_count = 0;
+}
+
+/**
+ * finish_cursor() - Finish traversing a single tree. If this was the last active cursor, free the
+ *                   cursors and notify the parent that the forest traversal is complete.
+ */
+static void finish_cursor(struct cursor *cursor)
+{
+	struct cursors *cursors = cursor->parent;
+	struct vdo_completion *parent = cursors->parent;
+
+	return_vio_to_pool(cursors->pool, UDS_FORGET(cursor->vio));
+	if (--cursors->active_roots > 0)
+		return;
+
+	UDS_FREE(cursors);
+
+	vdo_finish_completion(parent);
+}
+
+static void traverse(struct cursor *cursor);
+
+/**
+ * continue_traversal() - Continue traversing a block map tree.
+ * @completion: The VIO doing a read or write.
+ */
+static void continue_traversal(struct vdo_completion *completion)
+{
+	vio_record_metadata_io_error(as_vio(completion));
+	traverse(completion->parent);
+}
+
+/**
+ * finish_traversal_load() - Continue traversing a block map tree now that a page has been loaded.
+ * @completion: The VIO doing the read.
+ */
+static void finish_traversal_load(struct vdo_completion *completion)
+{
+	struct cursor *cursor = completion->parent;
+	height_t height = cursor->height;
+	struct cursor_level *level = &cursor->levels[height];
+	struct tree_page *tree_page =
+		&(cursor->tree->segments[0].levels[height][level->page_index]);
+	struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer;
+
+	vdo_copy_valid_page(cursor->vio->vio.data,
+			    cursor->parent->zone->block_map->nonce,
+			    pbn_from_vio_bio(cursor->vio->vio.bio),
+			    page);
+	traverse(cursor);
+}
+
+static void traversal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct cursor *cursor = vio->completion.parent;
+
+	continue_vio_after_io(vio, finish_traversal_load, cursor->parent->zone->thread_id);
+}
+
+/**
+ * traverse() - Traverse a single block map tree.
+ *
+ * This is the recursive heart of the traversal process.
+ */
+static void traverse(struct cursor *cursor)
+{
+	for (; cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT; cursor->height++) {
+		height_t height = cursor->height;
+		struct cursor_level *level = &cursor->levels[height];
+		struct tree_page *tree_page =
+			&(cursor->tree->segments[0].levels[height][level->page_index]);
+		struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer;
+
+		if (!page->header.initialized)
+			continue;
+
+		for (; level->slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; level->slot++) {
+			struct cursor_level *next_level;
+			page_number_t entry_index =
+				(VDO_BLOCK_MAP_ENTRIES_PER_PAGE * level->page_index) + level->slot;
+			struct data_location location =
+				vdo_unpack_block_map_entry(&page->entries[level->slot]);
+
+			if (!vdo_is_valid_location(&location)) {
+				/* This entry is invalid, so remove it from the page. */
+				page->entries[level->slot] =
+					vdo_pack_block_map_entry(VDO_ZERO_BLOCK,
+								 VDO_MAPPING_STATE_UNMAPPED);
+				vdo_write_tree_page(tree_page, cursor->parent->zone);
+				continue;
+			}
+
+			if (!vdo_is_mapped_location(&location))
+				continue;
+
+			/* Erase mapped entries past the end of the logical space. */
+			if (entry_index >= cursor->boundary.levels[height]) {
+				page->entries[level->slot] =
+					vdo_pack_block_map_entry(VDO_ZERO_BLOCK,
+								 VDO_MAPPING_STATE_UNMAPPED);
+				vdo_write_tree_page(tree_page, cursor->parent->zone);
+				continue;
+			}
+
+			if (cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT - 1) {
+				int result =
+					cursor->parent->entry_callback(location.pbn,
+								       cursor->parent->parent);
+
+				if (result != VDO_SUCCESS) {
+					page->entries[level->slot] =
+						vdo_pack_block_map_entry(VDO_ZERO_BLOCK,
+									 VDO_MAPPING_STATE_UNMAPPED);
+					vdo_write_tree_page(tree_page, cursor->parent->zone);
+					continue;
+				}
+			}
+
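+			/* There is nothing to descend into below a leaf page. */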
+			if (cursor->height == 0)
+				continue;
+
+			cursor->height--;
+			next_level = &cursor->levels[cursor->height];
+			next_level->page_index = entry_index;
+			next_level->slot = 0;
+			level->slot++;
+			submit_metadata_vio(&cursor->vio->vio,
+					    location.pbn,
+					    traversal_endio,
+					    continue_traversal,
+					    REQ_OP_READ | REQ_PRIO);
+			return;
+		}
+	}
+
+	finish_cursor(cursor);
+}
+
+/**
+ * launch_cursor() - Start traversing a single block map tree now that the cursor has a VIO with
+ *                   which to load pages.
+ * @context: The pooled_vio just acquired.
+ *
+ * Implements waiter_callback.
+ */
+static void launch_cursor(struct waiter *waiter, void *context)
+{
+	struct cursor *cursor = container_of(waiter, struct cursor, waiter);
+	struct pooled_vio *pooled = context;
+
+	cursor->vio = pooled;
+	pooled->vio.completion.parent = cursor;
+	pooled->vio.completion.callback_thread_id = cursor->parent->zone->thread_id;
+	traverse(cursor);
+}
+
+/**
+ * compute_boundary() - Compute the number of pages used at each level of the given root's tree.
+ *
+ * Return: The list of page counts as a boundary structure.
+ */
+static struct boundary compute_boundary(struct block_map *map, root_count_t root_index)
+{
+	struct boundary boundary;
+	height_t height;
+	page_count_t leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
+	/*
+	 * Compute the leaf pages for this root. If the number of leaf pages does not distribute
+	 * evenly, we must determine if this root gets an extra page. Extra pages are assigned to
+	 * roots starting from tree 0.
+	 */
+	page_count_t last_tree_root = (leaf_pages - 1) % map->root_count;
+	page_count_t level_pages = leaf_pages / map->root_count;
+
+	if (root_index <= last_tree_root)
+		level_pages++;
+
+	for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT - 1; height++) {
+		boundary.levels[height] = level_pages;
+		level_pages = DIV_ROUND_UP(level_pages, VDO_BLOCK_MAP_ENTRIES_PER_PAGE);
+	}
+
+	/* The root node always exists, even if the root is otherwise unused. */
+	boundary.levels[VDO_BLOCK_MAP_TREE_HEIGHT - 1] = 1;
+
+	return boundary;
+}
+
+/**
+ * vdo_traverse_forest() - Walk the entire forest of a block map.
+ * @callback: A function to call with the pbn of each allocated node in the forest.
+ * @parent: The completion to notify on each traversed PBN, and when the traversal is complete.
+ */
+void vdo_traverse_forest(struct block_map *map,
+			 vdo_entry_callback *callback,
+			 struct vdo_completion *parent)
+{
+	root_count_t root;
+	struct cursors *cursors;
+	int result;
+
+	result = UDS_ALLOCATE_EXTENDED(struct cursors,
+				       map->root_count,
+				       struct cursor,
+				       __func__,
+				       &cursors);
+	if (result != VDO_SUCCESS) {
+		vdo_fail_completion(parent, result);
+		return;
+	}
+
+	cursors->zone = &map->zones[0];
+	cursors->pool = cursors->zone->vio_pool;
+	cursors->entry_callback = callback;
+	cursors->parent = parent;
+	cursors->active_roots = map->root_count;
+	for (root = 0; root < map->root_count; root++) {
+		struct cursor *cursor = &cursors->cursors[root];
+
+		*cursor = (struct cursor) {
+			.tree = &map->forest->trees[root],
+			.height = VDO_BLOCK_MAP_TREE_HEIGHT - 1,
+			.parent = cursors,
+			.boundary = compute_boundary(map, root),
+		};
+
+		cursor->waiter.callback = launch_cursor;
+		acquire_vio_from_pool(cursors->pool, &cursor->waiter);
+	}
+}
+
+/**
+ * initialize_block_map_zone() - Initialize the per-zone portions of the block map.
+ * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be
+ *               written out.
+ */
+static int __must_check initialize_block_map_zone(struct block_map *map,
+						  zone_count_t zone_number,
+						  struct vdo *vdo,
+						  page_count_t cache_size,
+						  block_count_t maximum_age)
+{
+	int result;
+	block_count_t i;
+	struct block_map_zone *zone = &map->zones[zone_number];
+
+	STATIC_ASSERT_SIZEOF(struct page_descriptor, sizeof(u64));
+
+	zone->zone_number = zone_number;
+	zone->thread_id = vdo->thread_config.logical_threads[zone_number];
+	zone->block_map = map;
+
+	result = UDS_ALLOCATE_EXTENDED(struct dirty_lists,
+				       maximum_age,
+				       dirty_era_t,
+				       __func__,
+				       &zone->dirty_lists);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	zone->dirty_lists->maximum_age = maximum_age;
+	INIT_LIST_HEAD(&zone->dirty_lists->expired[VDO_TREE_PAGE]);
+	INIT_LIST_HEAD(&zone->dirty_lists->expired[VDO_CACHE_PAGE]);
+
+	for (i = 0; i < maximum_age; i++) {
+		INIT_LIST_HEAD(&zone->dirty_lists->eras[i][VDO_TREE_PAGE]);
+		INIT_LIST_HEAD(&zone->dirty_lists->eras[i][VDO_CACHE_PAGE]);
+	}
+
+	result = vdo_make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->loading_pages);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = make_vio_pool(vdo,
+			       BLOCK_MAP_VIO_POOL_SIZE,
+			       zone->thread_id,
+			       VIO_TYPE_BLOCK_MAP_INTERIOR,
+			       VIO_PRIORITY_METADATA,
+			       zone,
+			       &zone->vio_pool);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+
+	zone->page_cache.zone = zone;
+	zone->page_cache.vdo = vdo;
+	zone->page_cache.page_count = cache_size / map->zone_count;
+	zone->page_cache.stats.free_pages = zone->page_cache.page_count;
+
+	result = allocate_cache_components(&zone->page_cache);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* initialize empty circular queues */
+	INIT_LIST_HEAD(&zone->page_cache.lru_list);
+	INIT_LIST_HEAD(&zone->page_cache.outgoing_list);
+
+	return VDO_SUCCESS;
+}
+
+/* Implements vdo_zone_thread_getter */
+static thread_id_t get_block_map_zone_thread_id(void *context, zone_count_t zone_number)
+{
+	struct block_map *map = context;
+
+	return map->zones[zone_number].thread_id;
+}
+
+/* Implements vdo_action_preamble */
+static void prepare_for_era_advance(void *context, struct vdo_completion *parent)
+{
+	struct block_map *map = context;
+
+	map->current_era_point = map->pending_era_point;
+	vdo_finish_completion(parent);
+}
+
+/* Implements vdo_zone_action */
+static void advance_block_map_zone_era(void *context,
+				       zone_count_t zone_number,
+				       struct vdo_completion *parent)
+{
+	struct block_map *map = context;
+	struct block_map_zone *zone = &map->zones[zone_number];
+
+	update_period(zone->dirty_lists, map->current_era_point);
+	write_expired_elements(zone);
+	vdo_finish_completion(parent);
+}
+
+/*
+ * Schedule an era advance if necessary. This method should not be called directly. Rather, call
+ * vdo_schedule_default_action() on the block map's action manager.
+ *
+ * Implements vdo_action_scheduler.
+ */
+static bool schedule_era_advance(void *context)
+{
+	struct block_map *map = context;
+
+	if (map->current_era_point == map->pending_era_point)
+		return false;
+
+	return vdo_schedule_action(map->action_manager,
+				   prepare_for_era_advance,
+				   advance_block_map_zone_era,
+				   NULL,
+				   NULL);
+}
+
+static void uninitialize_block_map_zone(struct block_map_zone *zone)
+{
+	struct vdo_page_cache *cache = &zone->page_cache;
+
+	UDS_FREE(UDS_FORGET(zone->dirty_lists));
+	free_vio_pool(UDS_FORGET(zone->vio_pool));
+	vdo_free_int_map(UDS_FORGET(zone->loading_pages));
+	if (cache->infos != NULL) {
+		struct page_info *info;
+
+		for (info = cache->infos; info < cache->infos + cache->page_count; ++info)
+			free_vio(UDS_FORGET(info->vio));
+	}
+
+	vdo_free_int_map(UDS_FORGET(cache->page_map));
+	UDS_FREE(UDS_FORGET(cache->infos));
+	UDS_FREE(UDS_FORGET(cache->pages));
+}
+
+void vdo_free_block_map(struct block_map *map)
+{
+	zone_count_t zone;
+
+	if (map == NULL)
+		return;
+
+	for (zone = 0; zone < map->zone_count; zone++)
+		uninitialize_block_map_zone(&map->zones[zone]);
+
+	vdo_abandon_block_map_growth(map);
+	if (map->forest != NULL)
+		deforest(UDS_FORGET(map->forest), 0);
+	UDS_FREE(UDS_FORGET(map->action_manager));
+	UDS_FREE(map);
+}
+
+/* @journal may be NULL. */
+int vdo_decode_block_map(struct block_map_state_2_0 state,
+			 block_count_t logical_blocks,
+			 struct vdo *vdo,
+			 struct recovery_journal *journal,
+			 nonce_t nonce,
+			 page_count_t cache_size,
+			 block_count_t maximum_age,
+			 struct block_map **map_ptr)
+{
+	struct block_map *map;
+	int result;
+	zone_count_t zone = 0;
+
+	STATIC_ASSERT(VDO_BLOCK_MAP_ENTRIES_PER_PAGE ==
+		      ((VDO_BLOCK_SIZE - sizeof(struct block_map_page)) /
+		       sizeof(struct block_map_entry)));
+	result = ASSERT(cache_size > 0, "block map cache size is specified");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE_EXTENDED(struct block_map,
+				       vdo->thread_config.logical_zone_count,
+				       struct block_map_zone,
+				       __func__,
+				       &map);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	map->vdo = vdo;
+	map->root_origin = state.root_origin;
+	map->root_count = state.root_count;
+	map->entry_count = logical_blocks;
+	map->journal = journal;
+	map->nonce = nonce;
+
+	result = make_forest(map, map->entry_count);
+	if (result != VDO_SUCCESS) {
+		vdo_free_block_map(map);
+		return result;
+	}
+
+	replace_forest(map);
+
+	map->zone_count = vdo->thread_config.logical_zone_count;
+	for (zone = 0; zone < map->zone_count; zone++) {
+		result = initialize_block_map_zone(map, zone, vdo, cache_size, maximum_age);
+		if (result != VDO_SUCCESS) {
+			vdo_free_block_map(map);
+			return result;
+		}
+	}
+
+	result = vdo_make_action_manager(map->zone_count,
+					 get_block_map_zone_thread_id,
+					 vdo_get_recovery_journal_thread_id(journal),
+					 map,
+					 schedule_era_advance,
+					 vdo,
+					 &map->action_manager);
+	if (result != VDO_SUCCESS) {
+		vdo_free_block_map(map);
+		return result;
+	}
+
+	*map_ptr = map;
+	return VDO_SUCCESS;
+}
+
+struct block_map_state_2_0 vdo_record_block_map(const struct block_map *map)
+{
+	return (struct block_map_state_2_0) {
+		.flat_page_origin = VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
+		/* This is the flat page count, which has turned out to always be 0. */
+		.flat_page_count = 0,
+		.root_origin = map->root_origin,
+		.root_count = map->root_count,
+	};
+}
+
+/* The block map needs to know the journal's sequence number to initialize the eras. */
+void vdo_initialize_block_map_from_journal(struct block_map *map, struct recovery_journal *journal)
+{
+	zone_count_t z = 0;
+
+	map->current_era_point = vdo_get_recovery_journal_current_sequence_number(journal);
+	map->pending_era_point = map->current_era_point;
+
+	for (z = 0; z < map->zone_count; z++) {
+		struct dirty_lists *dirty_lists = map->zones[z].dirty_lists;
+
+		ASSERT_LOG_ONLY(dirty_lists->next_period == 0, "current period not set");
+		dirty_lists->oldest_period = map->current_era_point;
+		dirty_lists->next_period = map->current_era_point + 1;
+		dirty_lists->offset = map->current_era_point % dirty_lists->maximum_age;
+	}
+}
+
+/* Compute the logical zone for the LBN of a data vio. */
+zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio)
+{
+	struct block_map *map = vdo_from_data_vio(data_vio)->block_map;
+	struct tree_lock *tree_lock = &data_vio->tree_lock;
+	page_number_t page_number = data_vio->logical.lbn / VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+
+	tree_lock->tree_slots[0].page_index = page_number;
+	tree_lock->root_index = page_number % map->root_count;
+	return (tree_lock->root_index % map->zone_count);
+}
+
+/*
+ * Update the block map era information for a newly finished journal block. This method must be
+ * called from the journal zone thread.
+ */
+void vdo_advance_block_map_era(struct block_map *map, sequence_number_t recovery_block_number)
+{
+	if (map == NULL)
+		return;
+
+	map->pending_era_point = recovery_block_number;
+	vdo_schedule_default_action(map->action_manager);
+}
+
+/* Implements vdo_admin_initiator */
+static void initiate_drain(struct admin_state *state)
+{
+	struct block_map_zone *zone = container_of(state, struct block_map_zone, state);
+
+	ASSERT_LOG_ONLY((zone->active_lookups == 0),
+			"%s() called with no active lookups",
+			__func__);
+
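+	/* Unless merely suspending, all remaining dirty pages must be written out. */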
+	if (!vdo_is_state_suspending(state)) {
+		while (zone->dirty_lists->oldest_period < zone->dirty_lists->next_period)
+			expire_oldest_list(zone->dirty_lists);
+		write_expired_elements(zone);
+	}
+
+	check_for_drain_complete(zone);
+}
+
+/* Implements vdo_zone_action. */
+static void drain_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent)
+{
+	struct block_map *map = context;
+	struct block_map_zone *zone = &map->zones[zone_number];
+
+	vdo_start_draining(&zone->state,
+			   vdo_get_current_manager_operation(map->action_manager),
+			   parent,
+			   initiate_drain);
+}
+
+void vdo_drain_block_map(struct block_map *map,
+			 const struct admin_state_code *operation,
+			 struct vdo_completion *parent)
+{
+	vdo_schedule_operation(map->action_manager, operation, NULL, drain_zone, NULL, parent);
+}
+
+/* Implements vdo_zone_action. */
+static void
+resume_block_map_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent)
+{
+	struct block_map *map = context;
+	struct block_map_zone *zone = &map->zones[zone_number];
+
+	vdo_fail_completion(parent, vdo_resume_if_quiescent(&zone->state));
+}
+
+void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent)
+{
+	vdo_schedule_operation(map->action_manager,
+			       VDO_ADMIN_STATE_RESUMING,
+			       NULL,
+			       resume_block_map_zone,
+			       NULL,
+			       parent);
+}
+
+/* Allocate an expanded collection of trees for future growth. */
+int vdo_prepare_to_grow_block_map(struct block_map *map, block_count_t new_logical_blocks)
+{
+	if (map->next_entry_count == new_logical_blocks)
+		return VDO_SUCCESS;
+
+	if (map->next_entry_count > 0)
+		vdo_abandon_block_map_growth(map);
+
+	if (new_logical_blocks < map->entry_count) {
+		map->next_entry_count = map->entry_count;
+		return VDO_SUCCESS;
+	}
+
+	return make_forest(map, new_logical_blocks);
+}
+
+/* Implements vdo_action_preamble */
+static void grow_forest(void *context, struct vdo_completion *completion)
+{
+	replace_forest(context);
+	vdo_finish_completion(completion);
+}
+
+/* Requires vdo_prepare_to_grow_block_map() to have been previously called. */
+void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent)
+{
+	vdo_schedule_operation(map->action_manager,
+			       VDO_ADMIN_STATE_SUSPENDED_OPERATION,
+			       grow_forest,
+			       NULL,
+			       NULL,
+			       parent);
+}
+
+void vdo_abandon_block_map_growth(struct block_map *map)
+{
+	struct forest *forest = UDS_FORGET(map->next_forest);
+
+	if (forest != NULL)
+		deforest(forest, forest->segments - 1);
+
+	map->next_entry_count = 0;
+}
+
+/* Release the page completion and then continue the requester. */
+static inline void finish_processing_page(struct vdo_completion *completion, int result)
+{
+	struct vdo_completion *parent = completion->parent;
+
+	vdo_release_page_completion(completion);
+	vdo_continue_completion(parent, result);
+}
+
+static void handle_page_error(struct vdo_completion *completion)
+{
+	finish_processing_page(completion, completion->result);
+}
+
+/* Fetch the mapping page for a block map update, and call the provided handler when fetched. */
+static void fetch_mapping_page(struct data_vio *data_vio, bool modifiable, vdo_action *action)
+{
+	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+
+	if (vdo_is_state_draining(&zone->state)) {
+		continue_data_vio_with_error(data_vio, VDO_SHUTTING_DOWN);
+		return;
+	}
+
+	vdo_get_page(&data_vio->page_completion,
+		     zone,
+		     data_vio->tree_lock.tree_slots[0].block_map_slot.pbn,
+		     modifiable,
+		     &data_vio->vio.completion,
+		     action,
+		     handle_page_error,
+		     false);
+}
+
+/**
+ * clear_mapped_location() - Clear a data_vio's mapped block location, setting it to be unmapped.
+ *
+ * This indicates the block map entry for the logical block is either unmapped or corrupted.
+ */
+static void clear_mapped_location(struct data_vio *data_vio)
+{
+	data_vio->mapped = (struct zoned_pbn) {
+		.state = VDO_MAPPING_STATE_UNMAPPED,
+	};
+}
+
+/**
+ * set_mapped_location() - Decode and validate a block map entry, and set the mapped location of a
+ *                         data_vio.
+ *
+ * Return: VDO_SUCCESS, VDO_BAD_MAPPING if the map entry is invalid, or an error code for any
+ *         other failure.
+ */
+static int __must_check
+set_mapped_location(struct data_vio *data_vio, const struct block_map_entry *entry)
+{
+	/* Unpack the PBN for logging purposes even if the entry is invalid. */
+	struct data_location mapped = vdo_unpack_block_map_entry(entry);
+
+	if (vdo_is_valid_location(&mapped)) {
+		int result;
+
+		result = vdo_get_physical_zone(vdo_from_data_vio(data_vio),
+					       mapped.pbn,
+					       &data_vio->mapped.zone);
+		if (result == VDO_SUCCESS) {
+			data_vio->mapped.pbn = mapped.pbn;
+			data_vio->mapped.state = mapped.state;
+			return VDO_SUCCESS;
+		}
+
+		/*
+		 * Return all errors not specifically known to be errors from validating the
+		 * location.
+		 */
+		if ((result != VDO_OUT_OF_RANGE) && (result != VDO_BAD_MAPPING))
+			return result;
+	}
+
+	/*
+	 * Log the corruption even if we wind up ignoring it for write VIOs, converting all cases
+	 * to VDO_BAD_MAPPING.
+	 */
+	uds_log_error_strerror(VDO_BAD_MAPPING,
+			       "PBN %llu with state %u read from the block map was invalid",
+			       (unsigned long long) mapped.pbn,
+			       mapped.state);
+
+	/*
+	 * A read VIO has no option but to report the bad mapping--reading zeros would be hiding
+	 * known data loss.
+	 */
+	if (!data_vio->write)
+		return VDO_BAD_MAPPING;
+
+	/*
+	 * A write VIO only reads this mapping to decref the old block. Treat this as an unmapped
+	 * entry rather than fail the write.
+	 */
+	clear_mapped_location(data_vio);
+	return VDO_SUCCESS;
+}
+
+/* This callback is registered in vdo_get_mapped_block(). */
+static void get_mapping_from_fetched_page(struct vdo_completion *completion)
+{
+	int result;
+	struct vdo_page_completion *vpc = as_vdo_page_completion(completion);
+	const struct block_map_page *page;
+	const struct block_map_entry *entry;
+	struct data_vio *data_vio = as_data_vio(completion->parent);
+	struct block_map_tree_slot *tree_slot;
+
+	if (completion->result != VDO_SUCCESS) {
+		finish_processing_page(completion, completion->result);
+		return;
+	}
+
+	result = validate_completed_page(vpc, false);
+	if (result != VDO_SUCCESS) {
+		finish_processing_page(completion, result);
+		return;
+	}
+
+	page = (const struct block_map_page *) get_page_buffer(vpc->info);
+	tree_slot = &data_vio->tree_lock.tree_slots[0];
+	entry = &page->entries[tree_slot->block_map_slot.slot];
+
+	result = set_mapped_location(data_vio, entry);
+	finish_processing_page(completion, result);
+}
+
+void vdo_update_block_map_page(struct block_map_page *page,
+			       struct data_vio *data_vio,
+			       physical_block_number_t pbn,
+			       enum block_mapping_state mapping_state,
+			       sequence_number_t *recovery_lock)
+{
+	struct block_map_zone *zone = data_vio->logical.zone->block_map_zone;
+	struct block_map *block_map = zone->block_map;
+	struct recovery_journal *journal = block_map->journal;
+	sequence_number_t old_locked, new_locked;
+	struct tree_lock *tree_lock = &data_vio->tree_lock;
+
+	/* Encode the new mapping. */
+	page->entries[tree_lock->tree_slots[tree_lock->height].block_map_slot.slot] =
+		vdo_pack_block_map_entry(pbn, mapping_state);
+
+	/* Adjust references on the recovery journal blocks. */
+	old_locked = *recovery_lock;
+	new_locked = data_vio->recovery_sequence_number;
+
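+	/* The page references the earliest journal block with an uncommitted update to it. */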
+	if ((old_locked == 0) || (old_locked > new_locked)) {
+		vdo_acquire_recovery_journal_block_reference(journal,
+							     new_locked,
+							     VDO_ZONE_TYPE_LOGICAL,
+							     zone->zone_number);
+
+		if (old_locked > 0)
+			vdo_release_recovery_journal_block_reference(journal,
+								     old_locked,
+								     VDO_ZONE_TYPE_LOGICAL,
+								     zone->zone_number);
+
+		*recovery_lock = new_locked;
+	}
+
+	/*
+	 * FIXME: explain this more
+	 * Release the transferred lock from the data_vio.
+	 */
+	vdo_release_journal_entry_lock(journal, new_locked);
+	data_vio->recovery_sequence_number = 0;
+}
+
+static void put_mapping_in_fetched_page(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion->parent);
+	sequence_number_t old_lock;
+	struct vdo_page_completion *vpc;
+	struct page_info *info;
+	int result;
+
+	if (completion->result != VDO_SUCCESS) {
+		finish_processing_page(completion, completion->result);
+		return;
+	}
+
+	vpc = as_vdo_page_completion(completion);
+	result = validate_completed_page(vpc, true);
+	if (result != VDO_SUCCESS) {
+		finish_processing_page(completion, result);
+		return;
+	}
+
+	info = vpc->info;
+	old_lock = info->recovery_lock;
+	vdo_update_block_map_page((struct block_map_page *) get_page_buffer(info),
+				  data_vio,
+				  data_vio->new_mapped.pbn,
+				  data_vio->new_mapped.state,
+				  &info->recovery_lock);
+	set_info_state(info, PS_DIRTY);
+	add_to_dirty_lists(info->cache->zone,
+			   &info->state_entry,
+			   VDO_CACHE_PAGE,
+			   old_lock,
+			   info->recovery_lock);
+	finish_processing_page(completion, VDO_SUCCESS);
+}
+
+/* Read a stored block mapping into a data_vio. */
+void vdo_get_mapped_block(struct data_vio *data_vio)
+{
+	if (data_vio->tree_lock.tree_slots[0].block_map_slot.pbn == VDO_ZERO_BLOCK) {
+		/*
+		 * We know that the block map page for this LBN has not been allocated, so the
+		 * block must be unmapped.
+		 */
+		clear_mapped_location(data_vio);
+		continue_data_vio(data_vio);
+		return;
+	}
+
+	fetch_mapping_page(data_vio, false, get_mapping_from_fetched_page);
+}
+
+/* Update a stored block mapping to reflect a data_vio's new mapping. */
+void vdo_put_mapped_block(struct data_vio *data_vio)
+{
+	fetch_mapping_page(data_vio, true, put_mapping_in_fetched_page);
+}
+
+struct block_map_statistics vdo_get_block_map_statistics(struct block_map *map)
+{
+	zone_count_t zone = 0;
+	struct block_map_statistics totals;
+
+	memset(&totals, 0, sizeof(struct block_map_statistics));
+	for (zone = 0; zone < map->zone_count; zone++) {
+		const struct block_map_statistics *stats = &(map->zones[zone].page_cache.stats);
+
+		totals.dirty_pages += READ_ONCE(stats->dirty_pages);
+		totals.clean_pages += READ_ONCE(stats->clean_pages);
+		totals.free_pages += READ_ONCE(stats->free_pages);
+		totals.failed_pages += READ_ONCE(stats->failed_pages);
+		totals.incoming_pages += READ_ONCE(stats->incoming_pages);
+		totals.outgoing_pages += READ_ONCE(stats->outgoing_pages);
+		totals.cache_pressure += READ_ONCE(stats->cache_pressure);
+		totals.read_count += READ_ONCE(stats->read_count);
+		totals.write_count += READ_ONCE(stats->write_count);
+		totals.failed_reads += READ_ONCE(stats->failed_reads);
+		totals.failed_writes += READ_ONCE(stats->failed_writes);
+		totals.reclaimed += READ_ONCE(stats->reclaimed);
+		totals.read_outgoing += READ_ONCE(stats->read_outgoing);
+		totals.found_in_cache += READ_ONCE(stats->found_in_cache);
+		totals.discard_required += READ_ONCE(stats->discard_required);
+		totals.wait_for_page += READ_ONCE(stats->wait_for_page);
+		totals.fetch_required += READ_ONCE(stats->fetch_required);
+		totals.pages_loaded += READ_ONCE(stats->pages_loaded);
+		totals.pages_saved += READ_ONCE(stats->pages_saved);
+		totals.flush_count += READ_ONCE(stats->flush_count);
+	}
+
+	return totals;
+}
diff --git a/drivers/md/dm-vdo/block-map.h b/drivers/md/dm-vdo/block-map.h
new file mode 100644
index 00000000000..e7d243ae5bb
--- /dev/null
+++ b/drivers/md/dm-vdo/block-map.h
@@ -0,0 +1,391 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_BLOCK_MAP_H
+#define VDO_BLOCK_MAP_H
+
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "encodings.h"
+#include "int-map.h"
+#include "statistics.h"
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+enum {
+	BLOCK_MAP_VIO_POOL_SIZE = 64,
+};
+
+/* Used to indicate that the page holding the location of a tree root has been "loaded". */
+extern const physical_block_number_t VDO_INVALID_PBN;
+
+/*
+ * Generation counter for page references.
+ */
+typedef u32 vdo_page_generation;
+
+static const physical_block_number_t NO_PAGE = 0xFFFFFFFFFFFFFFFF;
+
+/* The VDO Page Cache abstraction. */
+struct vdo_page_cache {
+	/* the VDO which owns this cache */
+	struct vdo *vdo;
+	/* number of pages in cache */
+	page_count_t page_count;
+	/* number of pages to write in the current batch */
+	page_count_t pages_in_batch;
+	/* Whether the VDO is doing a read-only rebuild */
+	bool rebuilding;
+
+	/* array of page information entries */
+	struct page_info *infos;
+	/* raw memory for pages */
+	char *pages;
+	/* cache last found page info */
+	struct page_info *last_found;
+	/* map of page number to info */
+	struct int_map *page_map;
+	/* main LRU list (all infos) */
+	struct list_head lru_list;
+	/* free page list (oldest first) */
+	struct list_head free_list;
+	/* outgoing page list */
+	struct list_head outgoing_list;
+	/* number of read I/O operations pending */
+	page_count_t outstanding_reads;
+	/* number of write I/O operations pending */
+	page_count_t outstanding_writes;
+	/* number of pages covered by the current flush */
+	page_count_t pages_in_flush;
+	/* number of pages waiting to be included in the next flush */
+	page_count_t pages_to_flush;
+	/* number of discards in progress */
+	unsigned int discard_count;
+	/* how many VPCs are waiting for a free page */
+	unsigned int waiter_count;
+	/* queue of waiters who want a free page */
+	struct wait_queue free_waiters;
+	/*
+	 * Statistics are only updated on the logical zone thread, but are accessed from other
+	 * threads.
+	 */
+	struct block_map_statistics stats;
+	/* counter for pressure reports */
+	u32 pressure_report;
+	/* the block map zone to which this cache belongs */
+	struct block_map_zone *zone;
+};
+
+/*
+ * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
+ * otherwise it is bound to a particular page whose absolute pbn is in the pbn field. If the page
+ * is resident or dirty, the page data is stable and may be accessed. Otherwise the page is
+ * in flight (incoming or outgoing) and its data should not be accessed.
+ *
+ * @note Update the static data in get_page_state_name() if you change this enumeration.
+ */
+enum vdo_page_buffer_state {
+	/* this page buffer is not being used */
+	PS_FREE,
+	/* this page is being read from store */
+	PS_INCOMING,
+	/* attempt to load this page failed */
+	PS_FAILED,
+	/* this page is valid and un-modified */
+	PS_RESIDENT,
+	/* this page is valid and modified */
+	PS_DIRTY,
+	/* this page is being written and should not be used */
+	PS_OUTGOING,
+	/* not a state */
+	PAGE_STATE_COUNT,
+} __packed;
+
+/*
+ * The write status of a page
+ */
+enum vdo_page_write_status {
+	WRITE_STATUS_NORMAL,
+	WRITE_STATUS_DISCARD,
+	WRITE_STATUS_DEFERRED,
+} __packed;
+
+/* Per-page-slot information. */
+struct page_info {
+	/* Preallocated page struct vio */
+	struct vio *vio;
+	/* back-link for references */
+	struct vdo_page_cache *cache;
+	/* the pbn of the page */
+	physical_block_number_t pbn;
+	/* page is busy (temporarily locked) */
+	u16 busy;
+	/* the write status of the page */
+	enum vdo_page_write_status write_status;
+	/* page state */
+	enum vdo_page_buffer_state state;
+	/* queue of completions awaiting this item */
+	struct wait_queue waiting;
+	/* state linked list entry */
+	struct list_head state_entry;
+	/* LRU entry */
+	struct list_head lru_entry;
+	/*
+	 * The earliest recovery journal block containing uncommitted updates to the block map page
+	 * associated with this page_info. A reference (lock) is held on that block to prevent it
+	 * from being reaped. When this value changes, the reference on the old value must be
+	 * released and a reference on the new value must be acquired.
+	 */
+	sequence_number_t recovery_lock;
+};
+
+/*
+ * A completion awaiting a specific page. Also a live reference into the page once completed, until
+ * freed.
+ */
+struct vdo_page_completion {
+	/* The generic completion */
+	struct vdo_completion completion;
+	/* The cache involved */
+	struct vdo_page_cache *cache;
+	/* The waiter for the pending list */
+	struct waiter waiter;
+	/* The absolute physical block number of the page on disk */
+	physical_block_number_t pbn;
+	/* Whether the page may be modified */
+	bool writable;
+	/* Whether the page is available */
+	bool ready;
+	/* The info structure for the page, only valid when ready */
+	struct page_info *info;
+};
+
+struct forest;
+
+struct tree_page {
+	struct waiter waiter;
+
+	/* Dirty list entry */
+	struct list_head entry;
+
+	/* If dirty, the tree zone flush generation in which it was last dirtied. */
+	u8 generation;
+
+	/* Whether this page is an interior tree page being written out. */
+	bool writing;
+
+	/* If writing, the tree zone flush generation of the copy being written. */
+	u8 writing_generation;
+
+	/*
+	 * Sequence number of the earliest recovery journal block containing uncommitted updates to
+	 * this page
+	 */
+	sequence_number_t recovery_lock;
+
+	/* The value of recovery_lock when this page last started writing */
+	sequence_number_t writing_recovery_lock;
+
+	char page_buffer[VDO_BLOCK_SIZE];
+};
+
+enum block_map_page_type {
+	VDO_TREE_PAGE,
+	VDO_CACHE_PAGE,
+};
+
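+/* A pair of dirty lists, one for each block_map_page_type. */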
+typedef struct list_head dirty_era_t[2];
+
+struct dirty_lists {
+	/** The number of periods after which an element will be expired */
+	block_count_t maximum_age;
+	/** The oldest period which has unexpired elements */
+	sequence_number_t oldest_period;
+	/** One more than the current period */
+	sequence_number_t next_period;
+	/** The offset in the array of lists of the oldest period */
+	block_count_t offset;
+	/** Expired pages */
+	dirty_era_t expired;
+	/** The lists of dirty pages */
+	dirty_era_t eras[];
+};
+
+struct block_map_zone {
+	zone_count_t zone_number;
+	thread_id_t thread_id;
+	struct admin_state state;
+	struct block_map *block_map;
+	/* Dirty pages, by era */
+	struct dirty_lists *dirty_lists;
+	struct vdo_page_cache page_cache;
+	data_vio_count_t active_lookups;
+	struct int_map *loading_pages;
+	struct vio_pool *vio_pool;
+	/* The tree page which has issued or will be issuing a flush */
+	struct tree_page *flusher;
+	struct wait_queue flush_waiters;
+	/* The generation after the most recent flush */
+	u8 generation;
+	u8 oldest_generation;
+	/* The counts of dirty pages in each generation */
+	u32 dirty_page_counts[256];
+};
+
+struct block_map {
+	struct vdo *vdo;
+	struct action_manager *action_manager;
+	/* The absolute PBN of the first root of the tree part of the block map */
+	physical_block_number_t root_origin;
+	block_count_t root_count;
+
+	/* The era point we are currently distributing to the zones */
+	sequence_number_t current_era_point;
+	/* The next era point */
+	sequence_number_t pending_era_point;
+
+	/* The number of entries in block map */
+	block_count_t entry_count;
+	nonce_t nonce;
+	struct recovery_journal *journal;
+
+	/* The trees for finding block map pages */
+	struct forest *forest;
+	/* The expanded trees awaiting growth */
+	struct forest *next_forest;
+	/* The number of entries after growth */
+	block_count_t next_entry_count;
+
+	zone_count_t zone_count;
+	struct block_map_zone zones[];
+};
+
+/**
+ * typedef vdo_entry_callback - A function to be called for each allocated PBN when traversing the
+ *                              forest.
+ * @pbn: A PBN of a tree node.
+ * @completion: The parent completion of the traversal.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+typedef int vdo_entry_callback(physical_block_number_t pbn, struct vdo_completion *completion);
+
+static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
+	return container_of(completion, struct vdo_page_completion, completion);
+}
+
+void vdo_release_page_completion(struct vdo_completion *completion);
+
+void vdo_get_page(struct vdo_page_completion *page_completion,
+		  struct block_map_zone *zone,
+		  physical_block_number_t pbn,
+		  bool writable,
+		  void *parent,
+		  vdo_action *callback,
+		  vdo_action *error_handler,
+		  bool requeue);
+
+void vdo_request_page_write(struct vdo_completion *completion);
+
+int __must_check vdo_get_cached_page(struct vdo_completion *completion,
+				     struct block_map_page **page_ptr);
+
+int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);
+
+static inline struct block_map_page * __must_check
+vdo_as_block_map_page(struct tree_page *tree_page)
+{
+	return (struct block_map_page *) tree_page->page_buffer;
+}
+
+bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
+			 physical_block_number_t pbn,
+			 struct block_map_page *page);
+
+void vdo_find_block_map_slot(struct data_vio *data_vio);
+
+physical_block_number_t
+vdo_find_block_map_page_pbn(struct block_map *map, page_number_t page_number);
+
+void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);
+
+void vdo_traverse_forest(struct block_map *map,
+			 vdo_entry_callback *callback,
+			 struct vdo_completion *parent);
+
+int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
+				      block_count_t logical_blocks,
+				      struct vdo *vdo,
+				      struct recovery_journal *journal,
+				      nonce_t nonce,
+				      page_count_t cache_size,
+				      block_count_t maximum_age,
+				      struct block_map **map_ptr);
+
+void vdo_drain_block_map(struct block_map *map,
+			 const struct admin_state_code *operation,
+			 struct vdo_completion *parent);
+
+void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);
+
+int __must_check
+vdo_prepare_to_grow_block_map(struct block_map *map, block_count_t new_logical_blocks);
+
+void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);
+
+void vdo_abandon_block_map_growth(struct block_map *map);
+
+void vdo_free_block_map(struct block_map *map);
+
+struct block_map_state_2_0 __must_check
+vdo_record_block_map(const struct block_map *map);
+
+void vdo_initialize_block_map_from_journal(struct block_map *map,
+					   struct recovery_journal *journal);
+
+zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);
+
+void vdo_advance_block_map_era(struct block_map *map, sequence_number_t recovery_block_number);
+
+void vdo_update_block_map_page(struct block_map_page *page,
+			       struct data_vio *data_vio,
+			       physical_block_number_t pbn,
+			       enum block_mapping_state mapping_state,
+			       sequence_number_t *recovery_lock);
+
+void vdo_get_mapped_block(struct data_vio *data_vio);
+
+void vdo_put_mapped_block(struct data_vio *data_vio);
+
+struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);
+
+/**
+ * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format.
+ * @age: The configured maximum age
+ *
+ * Return: The converted age
+ *
+ * In the old recovery journal format, each journal block held 311 entries, and every write bio
+ * made two entries. The old maximum age was half the usable journal length. In the new format,
+ * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
+ * age so that the number of writes in a block map era is the same in the old and new formats. This
+ * keeps the bound on the amount of work required to recover the block map from the recovery
+ * journal the same across the format change. It also keeps the amortization of block map page
+ * writes to write bios the same.
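+ *
+ * For example, using the entry counts above (311 old, 217 new), a configured age of 620 journal
+ * blocks converts to DIV_ROUND_UP(620 * 311, 2 * 217) = 445 blocks.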
+ */
+static inline block_count_t vdo_convert_maximum_age(block_count_t age)
+{
+	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
+			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
+}
+
+#endif /* VDO_BLOCK_MAP_H */
diff --git a/drivers/md/dm-vdo/completion.c b/drivers/md/dm-vdo/completion.c
new file mode 100644
index 00000000000..8feb9c05c19
--- /dev/null
+++ b/drivers/md/dm-vdo/completion.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "completion.h"
+
+#include <linux/kernel.h>
+
+#include "logger.h"
+#include "permassert.h"
+
+#include "status-codes.h"
+#include "types.h"
+#include "vio.h"
+#include "vdo.h"
+
+/**
+ * DOC: vdo completions.
+ *
+ * Most of vdo's data structures are lock free, each either belonging to a single "zone," or
+ * divided into a number of zones whose accesses to the structure do not overlap. During normal
+ * operation, at most one thread will be operating in any given zone. Each zone has a
+ * vdo_work_queue which holds vdo_completions that are to be run in that zone. A completion may
+ * only be enqueued on one queue or operating in a single zone at a time.
+ *
+ * At each step of a multi-threaded operation, the completion performing the operation is given a
+ * callback, error handler, and thread id for the next step. A completion is "run" when it is
+ * operating on the correct thread (as specified by its callback_thread_id). If the value of its
+ * "result" field is an error (i.e. not VDO_SUCCESS), the function in its "error_handler" will be
+ * invoked. If the error_handler is NULL, or there is no error, the function set as its "callback"
+ * will be invoked. Generally, a completion will not be run directly, but rather will be
+ * "launched." In this case, it will check whether it is operating on the correct thread. If it is,
+ * it will run immediately. Otherwise, it will be enqueued on the vdo_work_queue associated with the
+ * completion's "callback_thread_id". When it is dequeued, it will be on the correct thread, and
+ * will get run. In some cases, the completion should get queued instead of running immediately,
+ * even if it is being launched from the correct thread. This is usually in cases where there is a
+ * long chain of callbacks, all on the same thread, which could overflow the stack. In such cases,
+ * the completion's "requeue" field should be set to true. Doing so will skip the current thread
+ * check and simply enqueue the completion.
+ *
+ * A completion may be "finished," in which case its "complete" field will be set to true before it
+ * is next run. It is a bug to attempt to set the result or re-finish a finished completion.
+ * Because a completion's fields are not safe to examine from any thread other than the one on
+ * which the completion is currently operating, this field is used only to aid in detecting
+ * programming errors. It cannot be used for cross-thread checking of the status of an operation.
+ * A completion must be "reset" before it can be reused after it has been finished. Resetting will
+ * also clear any error from the result field.
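+ *
+ * An illustrative sketch only (step_one(), step_two(), handle_error, thread_1, thread_2, and
+ * parent are hypothetical names, not part of this patch): a two-step operation might be driven
+ * as:
+ *
+ *   vdo_prepare_completion(completion, step_one, handle_error, thread_1, parent);
+ *   vdo_launch_completion(completion);
+ *
+ * step_one(), once its work on thread_1 is done, would hop threads with:
+ *
+ *   vdo_launch_completion_callback(completion, step_two, thread_2);
+ *
+ * and step_two() would end the operation by calling vdo_finish_completion().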
+ **/
+
+void vdo_initialize_completion(struct vdo_completion *completion,
+			       struct vdo *vdo,
+			       enum vdo_completion_type type)
+{
+	memset(completion, 0, sizeof(*completion));
+	completion->vdo = vdo;
+	completion->type = type;
+	vdo_reset_completion(completion);
+}
+
+static inline void assert_incomplete(struct vdo_completion *completion)
+{
+	ASSERT_LOG_ONLY(!completion->complete, "completion is not complete");
+}
+
+/**
+ * vdo_set_completion_result() - Set the result of a completion.
+ *
+ * Older errors will not be masked.
+ */
+void vdo_set_completion_result(struct vdo_completion *completion, int result)
+{
+	assert_incomplete(completion);
+	if (completion->result == VDO_SUCCESS)
+		completion->result = result;
+}
+
+/**
+ * vdo_launch_completion_with_priority() - Run or enqueue a completion.
+ * @priority: The priority at which to enqueue the completion.
+ *
+ * If called on the correct thread (i.e. the one specified in the completion's callback_thread_id
+ * field) and not marked for requeue, the completion will be run immediately. Otherwise, the
+ * completion will be enqueued on the specified thread.
+ */
+void vdo_launch_completion_with_priority(struct vdo_completion *completion,
+					 enum vdo_completion_priority priority)
+{
+	thread_id_t callback_thread = completion->callback_thread_id;
+
+	if (completion->requeue || (callback_thread != vdo_get_callback_thread_id())) {
+		vdo_enqueue_completion(completion, priority);
+		return;
+	}
+
+	vdo_run_completion(completion);
+}
+
+/** vdo_finish_completion() - Mark a completion as complete and then launch it. */
+void vdo_finish_completion(struct vdo_completion *completion)
+{
+	assert_incomplete(completion);
+	completion->complete = true;
+	if (completion->callback != NULL)
+		vdo_launch_completion(completion);
+}
+
+void vdo_enqueue_completion(struct vdo_completion *completion,
+			    enum vdo_completion_priority priority)
+{
+	struct vdo *vdo = completion->vdo;
+	thread_id_t thread_id = completion->callback_thread_id;
+
+	if (ASSERT(thread_id < vdo->thread_config.thread_count,
+		   "thread_id %u (completion type %d) is less than thread count %u",
+		   thread_id,
+		   completion->type,
+		   vdo->thread_config.thread_count) != UDS_SUCCESS)
+		BUG();
+
+	completion->requeue = false;
+	completion->priority = priority;
+	completion->my_queue = NULL;
+	vdo_enqueue_work_queue(vdo->threads[thread_id].queue, completion);
+}
+
+/**
+ * vdo_requeue_completion_if_needed() - Requeue a completion if not called on the specified thread.
+ *
+ * Return: True if the completion was requeued; callers may not access the completion in this case.
+ */
+bool vdo_requeue_completion_if_needed(struct vdo_completion *completion,
+				      thread_id_t callback_thread_id)
+{
+	if (vdo_get_callback_thread_id() == callback_thread_id)
+		return false;
+
+	completion->callback_thread_id = callback_thread_id;
+	vdo_enqueue_completion(completion, VDO_WORK_Q_DEFAULT_PRIORITY);
+	return true;
+}
diff --git a/drivers/md/dm-vdo/completion.h b/drivers/md/dm-vdo/completion.h
new file mode 100644
index 00000000000..03b2daa6546
--- /dev/null
+++ b/drivers/md/dm-vdo/completion.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_COMPLETION_H
+#define VDO_COMPLETION_H
+
+#include "permassert.h"
+
+#include "status-codes.h"
+#include "types.h"
+
+/**
+ * vdo_run_completion() - Run a completion's callback or error handler on the current thread.
+ *
+ * Context: This function must be called from the correct callback thread.
+ */
+static inline void vdo_run_completion(struct vdo_completion *completion)
+{
+	if ((completion->result != VDO_SUCCESS) && (completion->error_handler != NULL)) {
+		completion->error_handler(completion);
+		return;
+	}
+
+	completion->callback(completion);
+}
+
+void vdo_set_completion_result(struct vdo_completion *completion, int result);
+
+void vdo_initialize_completion(struct vdo_completion *completion,
+			       struct vdo *vdo,
+			       enum vdo_completion_type type);
+
+/**
+ * vdo_reset_completion() - Reset a completion to a clean state, while keeping the type, vdo and
+ *                          parent information.
+ */
+static inline void vdo_reset_completion(struct vdo_completion *completion)
+{
+	completion->result = VDO_SUCCESS;
+	completion->complete = false;
+}
+
+void vdo_launch_completion_with_priority(struct vdo_completion *completion,
+					 enum vdo_completion_priority priority);
+
+/**
+ * vdo_launch_completion() - Launch a completion with default priority.
+ */
+static inline void vdo_launch_completion(struct vdo_completion *completion)
+{
+	vdo_launch_completion_with_priority(completion, VDO_WORK_Q_DEFAULT_PRIORITY);
+}
+
+/**
+ * vdo_continue_completion() - Continue processing a completion.
+ * @result: The current result (will not mask older errors).
+ *
+ * Continue processing a completion by setting the current result and calling
+ * vdo_launch_completion().
+ */
+static inline void vdo_continue_completion(struct vdo_completion *completion, int result)
+{
+	vdo_set_completion_result(completion, result);
+	vdo_launch_completion(completion);
+}
+
+void vdo_finish_completion(struct vdo_completion *completion);
+
+/**
+ * vdo_fail_completion() - Set the result of a completion if it does not already have an error,
+ *                         then finish it.
+ */
+static inline void vdo_fail_completion(struct vdo_completion *completion, int result)
+{
+	vdo_set_completion_result(completion, result);
+	vdo_finish_completion(completion);
+}
+
+/**
+ * vdo_assert_completion_type() - Assert that a completion is of the correct type.
+ *
+ * Return: VDO_SUCCESS or an error
+ */
+static inline int
+vdo_assert_completion_type(struct vdo_completion *completion, enum vdo_completion_type expected)
+{
+	return ASSERT(expected == completion->type,
+		      "completion type should be %u, not %u",
+		      expected,
+		      completion->type);
+}
+
+static inline void vdo_set_completion_callback(struct vdo_completion *completion,
+					       vdo_action *callback,
+					       thread_id_t callback_thread_id)
+{
+	completion->callback = callback;
+	completion->callback_thread_id = callback_thread_id;
+}
+
+/**
+ * vdo_launch_completion_callback() - Set the callback for a completion and launch it immediately.
+ */
+static inline void vdo_launch_completion_callback(struct vdo_completion *completion,
+						  vdo_action *callback,
+						  thread_id_t callback_thread_id)
+{
+	vdo_set_completion_callback(completion, callback, callback_thread_id);
+	vdo_launch_completion(completion);
+}
+
+/**
+ * vdo_prepare_completion() - Prepare a completion for launch.
+ *
+ * Resets the completion, and then sets its callback, error handler, callback thread, and parent.
+ */
+static inline void vdo_prepare_completion(struct vdo_completion *completion,
+					  vdo_action *callback,
+					  vdo_action *error_handler,
+					  thread_id_t callback_thread_id,
+					  void *parent)
+{
+	vdo_reset_completion(completion);
+	vdo_set_completion_callback(completion, callback, callback_thread_id);
+	completion->error_handler = error_handler;
+	completion->parent = parent;
+}
+
+/**
+ * vdo_prepare_completion_for_requeue() - Prepare a completion for launch ensuring that it will
+ *                                        always be requeued.
+ *
+ * Resets the completion, and then sets its callback, error handler, callback thread, and parent.
+ */
+static inline void
+vdo_prepare_completion_for_requeue(struct vdo_completion *completion,
+				   vdo_action *callback,
+				   vdo_action *error_handler,
+				   thread_id_t callback_thread_id,
+				   void *parent)
+{
+	vdo_prepare_completion(completion, callback, error_handler, callback_thread_id, parent);
+	completion->requeue = true;
+}
+
+void vdo_enqueue_completion(struct vdo_completion *completion,
+			    enum vdo_completion_priority priority);
+
+bool vdo_requeue_completion_if_needed(struct vdo_completion *completion,
+				      thread_id_t callback_thread_id);
+
+#endif /* VDO_COMPLETION_H */
diff --git a/drivers/md/dm-vdo/constants.c b/drivers/md/dm-vdo/constants.c
new file mode 100644
index 00000000000..8bdfb782b13
--- /dev/null
+++ b/drivers/md/dm-vdo/constants.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "types.h"
+
+/* The maximum logical space is 4 petabytes, which is 1 terablock. */
+const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024;
+
+/* The maximum physical space is 256 terabytes, which is 64 gigablocks. */
+const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64;
+
+/* unit test minimum */
+const block_count_t MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2;
diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h
new file mode 100644
index 00000000000..7196e99efe9
--- /dev/null
+++ b/drivers/md/dm-vdo/constants.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_CONSTANTS_H
+#define VDO_CONSTANTS_H
+
+#include <linux/blkdev.h>
+
+#include "types.h"
+
+enum {
+	/*
+	 * The maximum number of contiguous PBNs which will go to a single bio submission queue,
+	 * assuming there is more than one queue.
+	 */
+	VDO_BIO_ROTATION_INTERVAL_LIMIT = 1024,
+
+	/** The number of entries on a block map page */
+	VDO_BLOCK_MAP_ENTRIES_PER_PAGE = 812,
+
+	/** The origin of the flat portion of the block map */
+	VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN = 1,
+
+	/*
+	 * The height of a block map tree. Assuming a root count of 60 and 812 entries per page,
+	 * this is big enough to represent almost 95 PB of logical space.
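+	 * (For scale: 60 roots * 812^4 four-kilobyte blocks comes to roughly 95 PiB. This is
+	 * illustrative arithmetic based on the figures above, not a limit enforced here.)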
+	 */
+	VDO_BLOCK_MAP_TREE_HEIGHT = 5,
+
+	/** The default number of bio submission queues. */
+	DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT = 4,
+
+	/** The number of contiguous PBNs to be submitted to a single bio queue. */
+	DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64,
+
+	/** The number of trees in the arboreal block map */
+	DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT = 60,
+
+	/** The default size of the recovery journal, in blocks */
+	DEFAULT_VDO_RECOVERY_JOURNAL_SIZE = 32 * 1024,
+
+	/** The default size of each slab journal, in blocks */
+	DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224,
+
+	/*
+	 * The initial size of lbn_operations and pbn_operations, which is based upon the expected
+	 * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely
+	 * that the maps would need to be resized.
+	 */
+	VDO_LOCK_MAP_CAPACITY = 10000,
+
+	/** The maximum number of logical zones */
+	MAX_VDO_LOGICAL_ZONES = 60,
+
+	/** The maximum number of physical zones */
+	MAX_VDO_PHYSICAL_ZONES = 16,
+
+	/** The base-2 logarithm of the maximum blocks in one slab */
+	MAX_VDO_SLAB_BITS = 23,
+
+	/** The maximum number of slabs the slab depot supports */
+	MAX_VDO_SLABS = 8192,
+
+	/*
+	 * The maximum number of block map pages to load simultaneously during recovery or rebuild.
+	 */
+	MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS = 1024,
+
+	/** The maximum number of entries in the slab summary */
+	MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES = MAX_VDO_SLABS * MAX_VDO_PHYSICAL_ZONES,
+
+	/** The maximum number of total threads in a VDO thread configuration. */
+	MAXIMUM_VDO_THREADS = 100,
+
+	/** The maximum number of VIOs in the system at once */
+	MAXIMUM_VDO_USER_VIOS = 2048,
+
+	/** The only physical block size supported by VDO */
+	VDO_BLOCK_SIZE = 4096,
+
+	/** The number of sectors per block */
+	VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT),
+
+	/** The size of a sector that will not be torn */
+	VDO_SECTOR_SIZE = 512,
+
+	/** The physical block number reserved for storing the zero block */
+	VDO_ZERO_BLOCK = 0,
+};
+
+/** The maximum logical space is 4 petabytes, which is 1 terablock. */
+extern const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS;
+
+/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */
+extern const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS;
+
+/** unit test minimum */
+extern const block_count_t MINIMUM_VDO_SLAB_JOURNAL_BLOCKS;
+
+#endif /* VDO_CONSTANTS_H */
diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c
new file mode 100644
index 00000000000..b8737e5cbc5
--- /dev/null
+++ b/drivers/md/dm-vdo/data-vio.c
@@ -0,0 +1,2070 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "data-vio.h"
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/device-mapper.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/lz4.h>
+#include <linux/minmax.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "murmurhash3.h"
+#include "permassert.h"
+
+#include "block-map.h"
+#include "dump.h"
+#include "encodings.h"
+#include "int-map.h"
+#include "io-submitter.h"
+#include "logical-zone.h"
+#include "packer.h"
+#include "recovery-journal.h"
+#include "slab-depot.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/**
+ * DOC: Bio flags.
+ *
+ * For certain flags set on user bios, if the user bio has not yet been acknowledged, setting those
+ * flags on our own bio(s) for that request may help underlying layers better fulfill the user
+ * bio's needs. This constant contains the aggregate of those flags; VDO strips all the other
+ * flags, as they convey incorrect information.
+ *
+ * These flags are only hints about IO importance, so they are always irrelevant once we have
+ * finished the user bio: at that point, how urgently the original request needed to complete no
+ * longer matters to any IO still remaining for it.
+ *
+ * Note that bio.c contains the complete list of flags we believe may be set; the following list
+ * explains the action taken with each of those flags VDO could receive:
+ *
+ * * REQ_SYNC: Passed down if the user bio is not yet completed, since it indicates the user bio
+ *   completion is required for further work to be done by the issuer.
+ * * REQ_META: Passed down if the user bio is not yet completed, since it may mean the lower layer
+ *   treats it as more urgent, similar to REQ_SYNC.
+ * * REQ_PRIO: Passed down if the user bio is not yet completed, since it indicates the user bio is
+ *   important.
+ * * REQ_NOMERGE: Set only if the incoming bio was split; irrelevant to VDO IO.
+ * * REQ_IDLE: Set if the incoming bio had more IO quickly following; VDO's IO pattern doesn't
+ *   match incoming IO, so this flag is incorrect for it.
+ * * REQ_FUA: Handled separately, and irrelevant to VDO IO otherwise.
+ * * REQ_RAHEAD: Passed down, since for reads it merely indicates trivial importance.
+ * * REQ_BACKGROUND: Not passed down, as VIOs are a limited resource and VDO needs them recycled
+ *   ASAP to service heavy load, which is the only place where REQ_BACKGROUND might aid in load
+ *   prioritization.
+ */
+static unsigned int PASSTHROUGH_FLAGS = (REQ_PRIO | REQ_META | REQ_SYNC | REQ_RAHEAD);
+
+/**
+ * DOC:
+ *
+ * The data_vio_pool maintains the pool of data_vios which a vdo uses to service incoming bios. For
+ * correctness, and in order to avoid potentially expensive or blocking memory allocations during
+ * normal operation, the number of concurrently active data_vios is capped. Furthermore, in order
+ * to avoid starvation of reads and writes, at most 75% of the data_vios may be used for
+ * discards. The data_vio_pool is responsible for enforcing these limits. Threads submitting bios
+ * for which a data_vio or discard permit are not available will block until the necessary
+ * resources are available. The pool is also responsible for distributing resources to blocked
+ * threads and waking them. Finally, the pool attempts to batch the work of recycling data_vios by
+ * performing the work of actually assigning resources to blocked threads or placing data_vios back
+ * into the pool on a single cpu at a time.
+ *
+ * The pool contains two "limiters", one for tracking data_vios and one for tracking discard
+ * permits. The limiters also provide safe cross-thread access to pool statistics without the need
+ * to take the pool's lock. When a thread submits a bio to a vdo device, it will first attempt to
+ * get a discard permit if it is a discard, and then to get a data_vio. If the necessary resources
+ * are available, the incoming bio will be assigned to the acquired data_vio, and it will be
+ * launched. However, if either of these is unavailable, the arrival time of the bio is recorded
+ * in the bio's bi_private field, the bio and its submitter are both queued on the appropriate
+ * limiter, and the submitting thread then puts itself to sleep. (Note that this mechanism will
+ * break if jiffies are only 32 bits.)
+ *
+ * Whenever a data_vio has completed processing for the bio it was servicing, release_data_vio()
+ * will be called on it. This function will add the data_vio to a funnel queue, and then check the
+ * state of the pool. If the pool is not currently processing released data_vios, the pool's
+ * completion will be enqueued on a cpu queue. This obviates the need for the releasing threads to
+ * hold the pool's lock, and also batches release work while avoiding starvation of the cpu
+ * threads.
+ *
+ * Whenever the pool's completion is run on a cpu thread, it calls process_release_callback() which
+ * processes a batch of returned data_vios (currently at most 32) from the pool's funnel queue. For
+ * each data_vio, it first checks whether that data_vio was processing a discard. If so, and there
+ * is a blocked bio waiting for a discard permit, that permit is notionally transferred to the
+ * eldest discard waiter, and that waiter is moved to the end of the list of discard bios waiting
+ * for a data_vio. If there are no discard waiters, the discard permit is returned to the pool.
+ * Next, the data_vio is assigned to the oldest blocked bio which either has a discard permit or
+ * doesn't need one, and is relaunched. If no such bio exists, the data_vio is returned to the
+ * pool. Finally, if any waiting bios were launched, the threads which blocked trying to submit
+ * them are awakened.
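+ *
+ * An illustrative sketch only (error handling omitted), showing the pool's external life cycle
+ * using the functions defined below:
+ *
+ *   make_data_vio_pool(vdo, pool_size, discard_limit, &pool);
+ *   ...
+ *   vdo_launch_bio(pool, bio);              (may sleep until resources are available)
+ *   ...
+ *   drain_data_vio_pool(pool, completion);  (before suspending)
+ *   resume_data_vio_pool(pool, completion); (on resume)
+ *   free_data_vio_pool(pool);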
+ */
+
+enum {
+	DATA_VIO_RELEASE_BATCH_SIZE = 128,
+};
+
+static const unsigned int VDO_SECTORS_PER_BLOCK_MASK = VDO_SECTORS_PER_BLOCK - 1;
+static const u32 COMPRESSION_STATUS_MASK = 0xff;
+static const u32 MAY_NOT_COMPRESS_MASK = 0x80000000;
+
+struct limiter;
+typedef void assigner(struct limiter *limiter);
+
+/* Bookkeeping structure for a single type of resource. */
+struct limiter {
+	/* The data_vio_pool to which this limiter belongs */
+	struct data_vio_pool *pool;
+	/* The maximum number of resources available */
+	data_vio_count_t limit;
+	/* The number of resources in use */
+	data_vio_count_t busy;
+	/* The maximum number of resources ever simultaneously in use */
+	data_vio_count_t max_busy;
+	/* The number of resources to release */
+	data_vio_count_t release_count;
+	/* The number of waiters to wake */
+	data_vio_count_t wake_count;
+	/* The list of waiting bios which are known to process_release_callback() */
+	struct bio_list waiters;
+	/* The list of waiting bios which are not yet known to process_release_callback() */
+	struct bio_list new_waiters;
+	/* The list of waiters which have their permits */
+	struct bio_list *permitted_waiters;
+	/* The function for assigning a resource to a waiter */
+	assigner *assigner;
+	/* The queue of blocked threads */
+	wait_queue_head_t blocked_threads;
+	/* The arrival time of the eldest waiter */
+	u64 arrival;
+};
+
+/*
+ * A data_vio_pool is a collection of preallocated data_vios which may be acquired from any thread,
+ * and are released in batches.
+ */
+struct data_vio_pool {
+	/* Completion for scheduling releases */
+	struct vdo_completion completion;
+	/* The administrative state of the pool */
+	struct admin_state state;
+	/* Lock protecting the pool */
+	spinlock_t lock;
+	/* The main limiter controlling the total data_vios in the pool. */
+	struct limiter limiter;
+	/* The limiter controlling data_vios for discard */
+	struct limiter discard_limiter;
+	/* The list of bios which have discard permits but still need a data_vio */
+	struct bio_list permitted_discards;
+	/* The list of available data_vios */
+	struct list_head available;
+	/* The queue of data_vios waiting to be returned to the pool */
+	struct funnel_queue *queue;
+	/* Whether the pool is processing, or scheduled to process releases */
+	atomic_t processing;
+	/* The data vios in the pool */
+	struct data_vio data_vios[];
+};
+
+static const char * const ASYNC_OPERATION_NAMES[] = {
+	"launch",
+	"acknowledge_write",
+	"acquire_hash_lock",
+	"attempt_logical_block_lock",
+	"lock_duplicate_pbn",
+	"check_for_duplication",
+	"cleanup",
+	"compress_data_vio",
+	"find_block_map_slot",
+	"get_mapped_block_for_read",
+	"get_mapped_block_for_write",
+	"hash_data_vio",
+	"journal_remapping",
+	"vdo_attempt_packing",
+	"put_mapped_block",
+	"read_data_vio",
+	"update_dedupe_index",
+	"update_reference_counts",
+	"verify_duplication",
+	"write_data_vio",
+};
+
+/* The steps taken to clean up a data_vio, in the order in which they are performed. */
+enum data_vio_cleanup_stage {
+	VIO_CLEANUP_START,
+	VIO_RELEASE_HASH_LOCK = VIO_CLEANUP_START,
+	VIO_RELEASE_ALLOCATED,
+	VIO_RELEASE_RECOVERY_LOCKS,
+	VIO_RELEASE_LOGICAL,
+	VIO_CLEANUP_DONE
+};
+
+static inline struct data_vio_pool * __must_check
+as_data_vio_pool(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_DATA_VIO_POOL_COMPLETION);
+	return container_of(completion, struct data_vio_pool, completion);
+}
+
+static inline u64 get_arrival_time(struct bio *bio)
+{
+	return (u64) bio->bi_private;
+}
+
+/**
+ * check_for_drain_complete_locked() - Check whether a data_vio_pool has no outstanding data_vios
+ *				       or waiters while holding the pool's lock.
+ */
+static bool check_for_drain_complete_locked(struct data_vio_pool *pool)
+{
+	if (pool->limiter.busy > 0)
+		return false;
+
+	ASSERT_LOG_ONLY((pool->discard_limiter.busy == 0), "no outstanding discard permits");
+
+	return (bio_list_empty(&pool->limiter.new_waiters) &&
+		bio_list_empty(&pool->discard_limiter.new_waiters));
+}
+
+static void initialize_lbn_lock(struct data_vio *data_vio, logical_block_number_t lbn)
+{
+	struct vdo *vdo = vdo_from_data_vio(data_vio);
+	zone_count_t zone_number;
+	struct lbn_lock *lock = &data_vio->logical;
+
+	lock->lbn = lbn;
+	lock->locked = false;
+	vdo_initialize_wait_queue(&lock->waiters);
+	zone_number = vdo_compute_logical_zone(data_vio);
+	lock->zone = &vdo->logical_zones->zones[zone_number];
+}
+
+static void launch_locked_request(struct data_vio *data_vio)
+{
+	data_vio->logical.locked = true;
+	if (data_vio->write) {
+		struct vdo *vdo = vdo_from_data_vio(data_vio);
+
+		if (vdo_is_read_only(vdo)) {
+			continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
+			return;
+		}
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT;
+	vdo_find_block_map_slot(data_vio);
+}
+
+static void acknowledge_data_vio(struct data_vio *data_vio)
+{
+	struct vdo *vdo = vdo_from_data_vio(data_vio);
+	struct bio *bio = data_vio->user_bio;
+	int error = vdo_map_to_system_error(data_vio->vio.completion.result);
+
+	if (bio == NULL)
+		return;
+
+	ASSERT_LOG_ONLY((data_vio->remaining_discard <=
+			 (u32) (VDO_BLOCK_SIZE - data_vio->offset)),
+			"data_vio to acknowledge is not an incomplete discard");
+
+	data_vio->user_bio = NULL;
+	vdo_count_bios(&vdo->stats.bios_acknowledged, bio);
+	if (data_vio->is_partial)
+		vdo_count_bios(&vdo->stats.bios_acknowledged_partial, bio);
+
+	bio->bi_status = errno_to_blk_status(error);
+	bio_endio(bio);
+}
+
+static void copy_to_bio(struct bio *bio, char *data_ptr)
+{
+	struct bio_vec biovec;
+	struct bvec_iter iter;
+
+	bio_for_each_segment(biovec, bio, iter) {
+		memcpy_to_bvec(&biovec, data_ptr);
+		data_ptr += biovec.bv_len;
+	}
+}
+
+struct data_vio_compression_status
+get_data_vio_compression_status(struct data_vio *data_vio)
+{
+	u32 packed = atomic_read(&data_vio->compression.status);
+
+	/* pairs with cmpxchg in set_data_vio_compression_status */
+	smp_rmb();
+	return (struct data_vio_compression_status) {
+		.stage = packed & COMPRESSION_STATUS_MASK,
+		.may_not_compress = ((packed & MAY_NOT_COMPRESS_MASK) != 0),
+	};
+}
+
+/**
+ * pack_status() - Convert a data_vio_compression_status into a u32 which may be stored
+ *                 atomically.
+ * @status: The state to convert.
+ *
+ * Return: The compression state packed into a u32.
+ */
+static u32 __must_check pack_status(struct data_vio_compression_status status)
+{
+	return status.stage | (status.may_not_compress ? MAY_NOT_COMPRESS_MASK : 0);
+}
+
+/**
+ * set_data_vio_compression_status() - Set the compression status of a data_vio.
+ * @state: The expected current status of the data_vio.
+ * @new_state: The status to set.
+ *
+ * Return: true if the new status was set, false if the data_vio's compression status did not
+ *         match the expected state, and so was left unchanged.
+ */
+static bool __must_check
+set_data_vio_compression_status(struct data_vio *data_vio,
+				struct data_vio_compression_status status,
+				struct data_vio_compression_status new_status)
+{
+	u32 actual;
+	u32 expected = pack_status(status);
+	u32 replacement = pack_status(new_status);
+
+	/*
+	 * Extra barriers because this was originally developed using a CAS operation that implicitly
+	 * had them.
+	 */
+	smp_mb__before_atomic();
+	actual = atomic_cmpxchg(&data_vio->compression.status, expected, replacement);
+	/* same as before_atomic */
+	smp_mb__after_atomic();
+	return (expected == actual);
+}
+
+struct data_vio_compression_status advance_data_vio_compression_stage(struct data_vio *data_vio)
+{
+	for (;;) {
+		struct data_vio_compression_status status =
+			get_data_vio_compression_status(data_vio);
+		struct data_vio_compression_status new_status = status;
+
+		if (status.stage == DATA_VIO_POST_PACKER)
+			/* We're already in the last stage. */
+			return status;
+
+		if (status.may_not_compress)
+			/*
+			 * Compression has been disallowed for this VIO, so skip the rest of the
+			 * path and go to the end.
+			 */
+			new_status.stage = DATA_VIO_POST_PACKER;
+		else
+			/* Go to the next state. */
+			new_status.stage++;
+
+		if (set_data_vio_compression_status(data_vio, status, new_status))
+			return new_status;
+
+		/* Another thread changed the status out from under us so try again. */
+	}
+}
+
+/**
+ * cancel_data_vio_compression() - Prevent this data_vio from being compressed or packed.
+ *
+ * Return: true if the data_vio is in the packer and the caller was the first caller to cancel it.
+ */
+bool cancel_data_vio_compression(struct data_vio *data_vio)
+{
+	struct data_vio_compression_status status, new_status;
+
+	for (;;) {
+		status = get_data_vio_compression_status(data_vio);
+		if (status.may_not_compress || (status.stage == DATA_VIO_POST_PACKER))
+			/* This data_vio is already set up to not block in the packer. */
+			break;
+
+		new_status.stage = status.stage;
+		new_status.may_not_compress = true;
+
+		if (set_data_vio_compression_status(data_vio, status, new_status))
+			break;
+	}
+
+	return ((status.stage == DATA_VIO_PACKING) && !status.may_not_compress);
+}
+
+/**
+ * attempt_logical_block_lock() - Attempt to acquire the lock on a logical block.
+ * @completion: The data_vio for an external data request as a completion.
+ *
+ * This is the start of the path for all external requests. It is registered in launch_data_vio().
+ */
+static void attempt_logical_block_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct lbn_lock *lock = &data_vio->logical;
+	struct vdo *vdo = vdo_from_data_vio(data_vio);
+	struct data_vio *lock_holder;
+	int result;
+
+	assert_data_vio_in_logical_zone(data_vio);
+
+	if (data_vio->logical.lbn >= vdo->states.vdo.config.logical_blocks) {
+		continue_data_vio_with_error(data_vio, VDO_OUT_OF_RANGE);
+		return;
+	}
+
+	result = vdo_int_map_put(lock->zone->lbn_operations,
+				 lock->lbn,
+				 data_vio,
+				 false,
+				 (void **) &lock_holder);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	if (lock_holder == NULL) {
+		/* We got the lock */
+		launch_locked_request(data_vio);
+		return;
+	}
+
+	result = ASSERT(lock_holder->logical.locked, "logical block lock held");
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	/*
+	 * If the new request is a pure read request (not read-modify-write) and the lock_holder is
+	 * writing and has received an allocation (VDO-2683), service the read request immediately
+	 * by copying data from the lock_holder to avoid having to flush the write out of the
+	 * packer just to prevent the read from waiting indefinitely. If the lock_holder does not
+	 * yet have an allocation, prevent it from blocking in the packer and wait on it.
+	 */
+	if (!data_vio->write && READ_ONCE(lock_holder->allocation_succeeded)) {
+		copy_to_bio(data_vio->user_bio, (lock_holder->vio.data + data_vio->offset));
+		acknowledge_data_vio(data_vio);
+		complete_data_vio(completion);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK;
+	vdo_enqueue_waiter(&lock_holder->logical.waiters, &data_vio->waiter);
+
+	/*
+	 * Prevent writes and read-modify-writes from blocking indefinitely on lock holders in the
+	 * packer.
+	 */
+	if (lock_holder->write && cancel_data_vio_compression(lock_holder)) {
+		data_vio->compression.lock_holder = lock_holder;
+		launch_data_vio_packer_callback(data_vio, vdo_remove_lock_holder_from_packer);
+	}
+}
+
+/**
+ * launch_data_vio() - (Re)initialize a data_vio to have a new logical block number, keeping the
+ *		       same parent and other state and send it on its way.
+ */
+static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lbn)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+
+	/*
+	 * Clearing the tree lock must happen before initializing the LBN lock, which also adds
+	 * information to the tree lock.
+	 */
+	memset(&data_vio->tree_lock, 0, sizeof(data_vio->tree_lock));
+	initialize_lbn_lock(data_vio, lbn);
+	INIT_LIST_HEAD(&data_vio->hash_lock_entry);
+	INIT_LIST_HEAD(&data_vio->write_entry);
+
+	memset(&data_vio->allocation, 0, sizeof(data_vio->allocation));
+
+	data_vio->is_duplicate = false;
+
+	memset(&data_vio->record_name, 0, sizeof(data_vio->record_name));
+	memset(&data_vio->duplicate, 0, sizeof(data_vio->duplicate));
+	vdo_reset_completion(completion);
+	completion->error_handler = handle_data_vio_error;
+	set_data_vio_logical_callback(data_vio, attempt_logical_block_lock);
+	vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY);
+}
+
+static bool is_zero_block(char *block)
+{
+	int i;
+
+	for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64))
+		if (*((u64 *) &block[i]))
+			return false;
+	return true;
+}
+
+static void copy_from_bio(struct bio *bio, char *data_ptr)
+{
+	struct bio_vec biovec;
+	struct bvec_iter iter;
+
+	bio_for_each_segment(biovec, bio, iter) {
+		memcpy_from_bvec(data_ptr, &biovec);
+		data_ptr += biovec.bv_len;
+	}
+}
+
+static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *bio)
+{
+	logical_block_number_t lbn;
+	/*
+	 * Zero out the fields which don't need to be preserved (i.e. which are not pointers to
+	 * separately allocated objects).
+	 */
+	memset(data_vio, 0, offsetof(struct data_vio, vio));
+	memset(&data_vio->compression, 0, offsetof(struct compression_state, block));
+
+	data_vio->user_bio = bio;
+	data_vio->offset = to_bytes(bio->bi_iter.bi_sector & VDO_SECTORS_PER_BLOCK_MASK);
+	data_vio->is_partial = (bio->bi_iter.bi_size < VDO_BLOCK_SIZE) || (data_vio->offset != 0);
+
+	/*
+	 * Discards behave very differently than other requests when coming in from device-mapper.
+	 * We have to be able to handle any size discards and various sector offsets within a
+	 * block.
+	 */
+	if (bio_op(bio) == REQ_OP_DISCARD) {
+		data_vio->remaining_discard = bio->bi_iter.bi_size;
+		data_vio->write = true;
+		data_vio->is_trim = true;
+		if (data_vio->is_partial) {
+			vdo_count_bios(&vdo->stats.bios_in_partial, bio);
+			data_vio->read = true;
+		}
+	} else if (data_vio->is_partial) {
+		vdo_count_bios(&vdo->stats.bios_in_partial, bio);
+		data_vio->read = true;
+		if (bio_data_dir(bio) == WRITE)
+			data_vio->write = true;
+	} else if (bio_data_dir(bio) == READ) {
+		data_vio->read = true;
+	} else {
+		/*
+		 * Copy the bio data to a char array so that we can continue to use the data after
+		 * we acknowledge the bio.
+		 */
+		copy_from_bio(bio, data_vio->vio.data);
+		data_vio->is_zero = is_zero_block(data_vio->vio.data);
+		data_vio->write = true;
+	}
+
+	if (data_vio->user_bio->bi_opf & REQ_FUA)
+		data_vio->fua = true;
+
+	lbn = (bio->bi_iter.bi_sector - vdo->starting_sector_offset) / VDO_SECTORS_PER_BLOCK;
+	launch_data_vio(data_vio, lbn);
+}
+
+static void assign_data_vio(struct limiter *limiter, struct data_vio *data_vio)
+{
+	struct bio *bio = bio_list_pop(limiter->permitted_waiters);
+
+	launch_bio(limiter->pool->completion.vdo, data_vio, bio);
+	limiter->wake_count++;
+
+	bio = bio_list_peek(limiter->permitted_waiters);
+	limiter->arrival = ((bio == NULL) ? U64_MAX : get_arrival_time(bio));
+}
+
+static void assign_discard_permit(struct limiter *limiter)
+{
+	struct bio *bio = bio_list_pop(&limiter->waiters);
+
+	if (limiter->arrival == U64_MAX)
+		limiter->arrival = get_arrival_time(bio);
+
+	bio_list_add(limiter->permitted_waiters, bio);
+}
+
+static void get_waiters(struct limiter *limiter)
+{
+	bio_list_merge(&limiter->waiters, &limiter->new_waiters);
+	bio_list_init(&limiter->new_waiters);
+}
+
+static inline
+struct data_vio *get_available_data_vio(struct data_vio_pool *pool)
+{
+	struct data_vio *data_vio =
+		list_first_entry(&pool->available, struct data_vio, pool_entry);
+
+	list_del_init(&data_vio->pool_entry);
+	return data_vio;
+}
+
+static void assign_data_vio_to_waiter(struct limiter *limiter)
+{
+	assign_data_vio(limiter, get_available_data_vio(limiter->pool));
+}
+
+static void update_limiter(struct limiter *limiter)
+{
+	struct bio_list *waiters = &limiter->waiters;
+	data_vio_count_t available = limiter->limit - limiter->busy;
+
+	ASSERT_LOG_ONLY((limiter->release_count <= limiter->busy),
+			"Release count %u is not more than busy count %u",
+			limiter->release_count,
+			limiter->busy);
+
+	get_waiters(limiter);
+	for (; (limiter->release_count > 0) && !bio_list_empty(waiters); limiter->release_count--)
+		limiter->assigner(limiter);
+
+	if (limiter->release_count > 0) {
+		WRITE_ONCE(limiter->busy, limiter->busy - limiter->release_count);
+		limiter->release_count = 0;
+		return;
+	}
+
+	for (; (available > 0) && !bio_list_empty(waiters); available--)
+		limiter->assigner(limiter);
+
+	WRITE_ONCE(limiter->busy, limiter->limit - available);
+	if (limiter->max_busy < limiter->busy)
+		WRITE_ONCE(limiter->max_busy, limiter->busy);
+}
+
+/**
+ * schedule_releases() - Ensure that release processing is scheduled.
+ *
+ * If this call switches the state to processing, enqueue the pool's completion. Otherwise, some
+ * other thread has already done so.
+ */
+static void schedule_releases(struct data_vio_pool *pool)
+{
+	/* Pairs with the barrier in process_release_callback(). */
+	smp_mb__before_atomic();
+	if (atomic_cmpxchg(&pool->processing, false, true))
+		return;
+
+	pool->completion.requeue = true;
+	vdo_launch_completion_with_priority(&pool->completion, CPU_Q_COMPLETE_VIO_PRIORITY);
+}
+
+static void reuse_or_release_resources(struct data_vio_pool *pool,
+				       struct data_vio *data_vio,
+				       struct list_head *returned)
+{
+	if (data_vio->remaining_discard > 0) {
+		if (bio_list_empty(&pool->discard_limiter.waiters))
+			/* Return the data_vio's discard permit. */
+			pool->discard_limiter.release_count++;
+		else
+			assign_discard_permit(&pool->discard_limiter);
+	}
+
+	if (pool->limiter.arrival < pool->discard_limiter.arrival) {
+		assign_data_vio(&pool->limiter, data_vio);
+	} else if (pool->discard_limiter.arrival < U64_MAX) {
+		assign_data_vio(&pool->discard_limiter, data_vio);
+	} else {
+		list_add(&data_vio->pool_entry, returned);
+		pool->limiter.release_count++;
+	}
+}
+
+/**
+ * process_release_callback() - Process a batch of data_vio releases.
+ * @completion: The pool with data_vios to release.
+ */
+static void process_release_callback(struct vdo_completion *completion)
+{
+	struct data_vio_pool *pool = as_data_vio_pool(completion);
+	bool reschedule;
+	bool drained;
+	data_vio_count_t processed;
+	data_vio_count_t to_wake;
+	data_vio_count_t discards_to_wake;
+	LIST_HEAD(returned);
+
+	spin_lock(&pool->lock);
+	get_waiters(&pool->discard_limiter);
+	get_waiters(&pool->limiter);
+	spin_unlock(&pool->lock);
+
+	if (pool->limiter.arrival == U64_MAX) {
+		struct bio *bio = bio_list_peek(&pool->limiter.waiters);
+
+		if (bio != NULL)
+			pool->limiter.arrival = get_arrival_time(bio);
+	}
+
+	for (processed = 0; processed < DATA_VIO_RELEASE_BATCH_SIZE; processed++) {
+		struct data_vio *data_vio;
+		struct funnel_queue_entry *entry = uds_funnel_queue_poll(pool->queue);
+
+		if (entry == NULL)
+			break;
+
+		data_vio = as_data_vio(container_of(entry,
+						    struct vdo_completion,
+						    work_queue_entry_link));
+		acknowledge_data_vio(data_vio);
+		reuse_or_release_resources(pool, data_vio, &returned);
+	}
+
+	spin_lock(&pool->lock);
+	/*
+	 * There is a race where waiters could be added while we are in the unlocked section above.
+	 * Those waiters could not see the resources we are now about to release, so we assign
+	 * those resources now as we have no guarantee of being rescheduled. This is handled in
+	 * update_limiter().
+	 */
+	update_limiter(&pool->discard_limiter);
+	list_splice(&returned, &pool->available);
+	update_limiter(&pool->limiter);
+	to_wake = pool->limiter.wake_count;
+	pool->limiter.wake_count = 0;
+	discards_to_wake = pool->discard_limiter.wake_count;
+	pool->discard_limiter.wake_count = 0;
+
+	atomic_set(&pool->processing, false);
+	/* Pairs with the barrier in schedule_releases(). */
+	smp_mb();
+
+	reschedule = !uds_is_funnel_queue_empty(pool->queue);
+	drained = (!reschedule &&
+		   vdo_is_state_draining(&pool->state) &&
+		   check_for_drain_complete_locked(pool));
+	spin_unlock(&pool->lock);
+
+	if (to_wake > 0)
+		wake_up_nr(&pool->limiter.blocked_threads, to_wake);
+
+	if (discards_to_wake > 0)
+		wake_up_nr(&pool->discard_limiter.blocked_threads,
+			   discards_to_wake);
+
+	if (reschedule)
+		schedule_releases(pool);
+	else if (drained)
+		vdo_finish_draining(&pool->state);
+}
+
+static void initialize_limiter(struct limiter *limiter,
+			       struct data_vio_pool *pool,
+			       assigner *assigner,
+			       data_vio_count_t limit)
+{
+	limiter->pool = pool;
+	limiter->assigner = assigner;
+	limiter->limit = limit;
+	limiter->arrival = U64_MAX;
+	init_waitqueue_head(&limiter->blocked_threads);
+}
+
+/**
+ * initialize_data_vio() - Allocate the components of a data_vio.
+ *
+ * The caller is responsible for cleaning up the data_vio on error.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int initialize_data_vio(struct data_vio *data_vio, struct vdo *vdo)
+{
+	struct bio *bio;
+	int result;
+
+	STATIC_ASSERT(VDO_BLOCK_SIZE <= PAGE_SIZE);
+	result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "data_vio data", &data_vio->vio.data);
+	if (result != VDO_SUCCESS)
+		return uds_log_error_strerror(result, "data_vio data allocation failure");
+
+	result = uds_allocate_memory(VDO_BLOCK_SIZE,
+				     0,
+				     "compressed block",
+				     &data_vio->compression.block);
+	if (result != VDO_SUCCESS)
+		return uds_log_error_strerror(result,
+					      "data_vio compressed block allocation failure");
+
+	result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "vio scratch", &data_vio->scratch_block);
+	if (result != VDO_SUCCESS)
+		return uds_log_error_strerror(result, "data_vio scratch allocation failure");
+
+	result = vdo_create_bio(&bio);
+	if (result != VDO_SUCCESS)
+		return uds_log_error_strerror(result, "data_vio data bio allocation failure");
+
+	vdo_initialize_completion(&data_vio->decrement_completion, vdo, VDO_DECREMENT_COMPLETION);
+	initialize_vio(&data_vio->vio, bio, 1, VIO_TYPE_DATA, VIO_PRIORITY_DATA, vdo);
+
+	return VDO_SUCCESS;
+}
+
+static void destroy_data_vio(struct data_vio *data_vio)
+{
+	if (data_vio == NULL)
+		return;
+
+	vdo_free_bio(UDS_FORGET(data_vio->vio.bio));
+	UDS_FREE(UDS_FORGET(data_vio->vio.data));
+	UDS_FREE(UDS_FORGET(data_vio->compression.block));
+	UDS_FREE(UDS_FORGET(data_vio->scratch_block));
+}
+
+/**
+ * make_data_vio_pool() - Initialize a data_vio pool.
+ * @vdo: The vdo to which the pool will belong.
+ * @pool_size: The number of data_vios in the pool.
+ * @discard_limit: The maximum number of data_vios which may be used for discards.
+ * @pool: A pointer to hold the newly allocated pool.
+ */
+int make_data_vio_pool(struct vdo *vdo,
+		       data_vio_count_t pool_size,
+		       data_vio_count_t discard_limit,
+		       struct data_vio_pool **pool_ptr)
+{
+	int result;
+	struct data_vio_pool *pool;
+	data_vio_count_t i;
+
+	result = UDS_ALLOCATE_EXTENDED(struct data_vio_pool,
+				       pool_size,
+				       struct data_vio,
+				       __func__,
+				       &pool);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	ASSERT_LOG_ONLY((discard_limit <= pool_size), "discard limit does not exceed pool size");
+	initialize_limiter(&pool->discard_limiter, pool, assign_discard_permit, discard_limit);
+	pool->discard_limiter.permitted_waiters = &pool->permitted_discards;
+	initialize_limiter(&pool->limiter, pool, assign_data_vio_to_waiter, pool_size);
+	pool->limiter.permitted_waiters = &pool->limiter.waiters;
+	INIT_LIST_HEAD(&pool->available);
+	spin_lock_init(&pool->lock);
+	vdo_set_admin_state_code(&pool->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	vdo_initialize_completion(&pool->completion, vdo, VDO_DATA_VIO_POOL_COMPLETION);
+	vdo_prepare_completion(&pool->completion,
+			       process_release_callback,
+			       process_release_callback,
+			       vdo->thread_config.cpu_thread,
+			       NULL);
+
+	result = uds_make_funnel_queue(&pool->queue);
+	if (result != UDS_SUCCESS) {
+		free_data_vio_pool(UDS_FORGET(pool));
+		return result;
+	}
+
+	for (i = 0; i < pool_size; i++) {
+		struct data_vio *data_vio = &pool->data_vios[i];
+
+		result = initialize_data_vio(data_vio, vdo);
+		if (result != VDO_SUCCESS) {
+			destroy_data_vio(data_vio);
+			free_data_vio_pool(pool);
+			return result;
+		}
+
+		list_add(&data_vio->pool_entry, &pool->available);
+	}
+
+	*pool_ptr = pool;
+	return VDO_SUCCESS;
+}
+
+/**
+ * free_data_vio_pool() - Free a data_vio_pool and the data_vios in it.
+ *
+ * All data_vios must be returned to the pool before calling this function.
+ */
+void free_data_vio_pool(struct data_vio_pool *pool)
+{
+	struct data_vio *data_vio, *tmp;
+
+	if (pool == NULL)
+		return;
+
+	/*
+	 * Pairs with the barrier in process_release_callback(). Possibly not needed since it
+	 * caters to an enqueue vs. free race.
+	 */
+	smp_mb();
+	BUG_ON(atomic_read(&pool->processing));
+
+	spin_lock(&pool->lock);
+	ASSERT_LOG_ONLY((pool->limiter.busy == 0),
+			"data_vio pool must not have %u busy entries when being freed",
+			pool->limiter.busy);
+	ASSERT_LOG_ONLY((bio_list_empty(&pool->limiter.waiters) &&
+			 bio_list_empty(&pool->limiter.new_waiters)),
+			"data_vio pool must not have threads waiting to read or write when being freed");
+	ASSERT_LOG_ONLY((bio_list_empty(&pool->discard_limiter.waiters) &&
+			 bio_list_empty(&pool->discard_limiter.new_waiters)),
+			"data_vio pool must not have threads waiting to discard when being freed");
+	spin_unlock(&pool->lock);
+
+	list_for_each_entry_safe(data_vio, tmp, &pool->available, pool_entry) {
+		list_del_init(&data_vio->pool_entry);
+		destroy_data_vio(data_vio);
+	}
+
+	uds_free_funnel_queue(UDS_FORGET(pool->queue));
+	UDS_FREE(pool);
+}
+
+static bool acquire_permit(struct limiter *limiter, struct bio *bio)
+{
+	if (limiter->busy >= limiter->limit) {
+		DEFINE_WAIT(wait);
+
+		bio_list_add(&limiter->new_waiters, bio);
+		prepare_to_wait_exclusive(&limiter->blocked_threads, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&limiter->pool->lock);
+		io_schedule();
+		finish_wait(&limiter->blocked_threads, &wait);
+		return false;
+	}
+
+	WRITE_ONCE(limiter->busy, limiter->busy + 1);
+	if (limiter->max_busy < limiter->busy)
+		WRITE_ONCE(limiter->max_busy, limiter->busy);
+
+	return true;
+}
+
+/**
+ * vdo_launch_bio() - Acquire a data_vio from the pool, assign the bio to it, and launch it.
+ *
+ * This will block if data_vios or discard permits are not available.
+ */
+void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio)
+{
+	struct data_vio *data_vio;
+
+	ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&pool->state),
+			"data_vio_pool not quiescent on acquire");
+
+	bio->bi_private = (void *) jiffies;
+	spin_lock(&pool->lock);
+	if ((bio_op(bio) == REQ_OP_DISCARD) && !acquire_permit(&pool->discard_limiter, bio))
+		return;
+
+	if (!acquire_permit(&pool->limiter, bio))
+		return;
+
+	data_vio = get_available_data_vio(pool);
+	spin_unlock(&pool->lock);
+	launch_bio(pool->completion.vdo, data_vio, bio);
+}
+
+/* Implements vdo_admin_initiator. */
+static void initiate_drain(struct admin_state *state)
+{
+	bool drained;
+	struct data_vio_pool *pool = container_of(state, struct data_vio_pool, state);
+
+	spin_lock(&pool->lock);
+	drained = check_for_drain_complete_locked(pool);
+	spin_unlock(&pool->lock);
+
+	if (drained)
+		vdo_finish_draining(state);
+}
+
+static void assert_on_vdo_cpu_thread(const struct vdo *vdo, const char *name)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.cpu_thread),
+			"%s called on cpu thread",
+			name);
+}
+
+/**
+ * drain_data_vio_pool() - Wait asynchronously for all data_vios to be returned to the pool.
+ * @completion: The completion to notify when the pool has drained.
+ */
+void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion)
+{
+	assert_on_vdo_cpu_thread(completion->vdo, __func__);
+	vdo_start_draining(&pool->state, VDO_ADMIN_STATE_SUSPENDING, completion, initiate_drain);
+}
+
+/**
+ * resume_data_vio_pool() - Resume a data_vio pool.
+ * @completion: The completion to notify when the pool has resumed.
+ */
+void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion)
+{
+	assert_on_vdo_cpu_thread(completion->vdo, __func__);
+	vdo_continue_completion(completion, vdo_resume_if_quiescent(&pool->state));
+}
+
+static void dump_limiter(const char *name, struct limiter *limiter)
+{
+	uds_log_info("%s: %u of %u busy (max %u), %s",
+		     name,
+		     limiter->busy,
+		     limiter->limit,
+		     limiter->max_busy,
+		     ((bio_list_empty(&limiter->waiters) &&
+		       bio_list_empty(&limiter->new_waiters)) ? "no waiters" : "has waiters"));
+}
+
+/**
+ * dump_data_vio_pool() - Dump a data_vio pool to the log.
+ * @dump_vios: Whether to dump the details of each busy data_vio as well.
+ */
+void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios)
+{
+	/*
+	 * In order that syslog can empty its buffer, sleep after 35 elements for 4ms (until the
+	 * second clock tick). These numbers were picked based on experiments with lab machines.
+	 */
+	enum { ELEMENTS_PER_BATCH = 35 };
+	enum { SLEEP_FOR_SYSLOG = 4000 };
+
+	if (pool == NULL)
+		return;
+
+	spin_lock(&pool->lock);
+	dump_limiter("data_vios", &pool->limiter);
+	dump_limiter("discard permits", &pool->discard_limiter);
+	if (dump_vios) {
+		int i;
+		int dumped = 0;
+
+		for (i = 0; i < pool->limiter.limit; i++) {
+			struct data_vio *data_vio = &pool->data_vios[i];
+
+			if (!list_empty(&data_vio->pool_entry))
+				continue;
+
+			dump_data_vio(data_vio);
+			if (++dumped >= ELEMENTS_PER_BATCH) {
+				spin_unlock(&pool->lock);
+				dumped = 0;
+				fsleep(SLEEP_FOR_SYSLOG);
+				spin_lock(&pool->lock);
+			}
+		}
+	}
+
+	spin_unlock(&pool->lock);
+}
+
+data_vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool)
+{
+	return READ_ONCE(pool->discard_limiter.busy);
+}
+
+data_vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool)
+{
+	return READ_ONCE(pool->discard_limiter.limit);
+}
+
+data_vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool)
+{
+	return READ_ONCE(pool->discard_limiter.max_busy);
+}
+
+int set_data_vio_pool_discard_limit(struct data_vio_pool *pool, data_vio_count_t limit)
+{
+	if (get_data_vio_pool_request_limit(pool) < limit)
+		/* The discard limit may not be higher than the data_vio limit. */
+		return -EINVAL;
+
+	spin_lock(&pool->lock);
+	pool->discard_limiter.limit = limit;
+	spin_unlock(&pool->lock);
+
+	return VDO_SUCCESS;
+}
+
+data_vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool)
+{
+	return READ_ONCE(pool->limiter.busy);
+}
+
+data_vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool)
+{
+	return READ_ONCE(pool->limiter.limit);
+}
+
+data_vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool)
+{
+	return READ_ONCE(pool->limiter.max_busy);
+}
+
+static void update_data_vio_error_stats(struct data_vio *data_vio)
+{
+	u8 index = 0;
+	static const char * const operations[] = {
+		[0] = "empty",
+		[1] = "read",
+		[2] = "write",
+		[3] = "read-modify-write",
+		[5] = "read+fua",
+		[6] = "write+fua",
+		[7] = "read-modify-write+fua",
+	};
+
+	if (data_vio->read)
+		index = 1;
+
+	if (data_vio->write)
+		index += 2;
+
+	if (data_vio->fua)
+		index += 4;
+
+	update_vio_error_stats(&data_vio->vio,
+			       "Completing %s vio for LBN %llu with error after %s",
+			       operations[index],
+			       (unsigned long long) data_vio->logical.lbn,
+			       get_data_vio_operation_name(data_vio));
+}
+
+static void perform_cleanup_stage(struct data_vio *data_vio, enum data_vio_cleanup_stage stage);
+
+/**
+ * release_allocated_lock() - Release the PBN lock and/or the reference on the allocated block at
+ *			      the end of processing a data_vio.
+ */
+static void release_allocated_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_allocated_zone(data_vio);
+	release_data_vio_allocation_lock(data_vio, false);
+	perform_cleanup_stage(data_vio, VIO_RELEASE_RECOVERY_LOCKS);
+}
+
+/** release_lock() - Release an uncontended LBN lock. */
+static void release_lock(struct data_vio *data_vio, struct lbn_lock *lock)
+{
+	struct int_map *lock_map = lock->zone->lbn_operations;
+	struct data_vio *lock_holder;
+
+	if (!lock->locked) {
+		/*  The lock is not locked, so it had better not be registered in the lock map. */
+		struct data_vio *lock_holder = vdo_int_map_get(lock_map, lock->lbn);
+
+		ASSERT_LOG_ONLY((data_vio != lock_holder),
+				"no logical block lock held for block %llu",
+				(unsigned long long) lock->lbn);
+		return;
+	}
+
+	/* Release the lock by removing the lock from the map. */
+	lock_holder = vdo_int_map_remove(lock_map, lock->lbn);
+	ASSERT_LOG_ONLY((data_vio == lock_holder),
+			"logical block lock mismatch for block %llu",
+			(unsigned long long) lock->lbn);
+	lock->locked = false;
+}
+
+/** transfer_lock() - Transfer a contended LBN lock to the eldest waiter. */
+static void transfer_lock(struct data_vio *data_vio, struct lbn_lock *lock)
+{
+	struct data_vio *lock_holder, *next_lock_holder;
+	int result;
+
+	ASSERT_LOG_ONLY(lock->locked, "lbn_lock with waiters is not locked");
+
+	/* Another data_vio is waiting for the lock, transfer it in a single lock map operation. */
+	next_lock_holder = waiter_as_data_vio(vdo_dequeue_next_waiter(&lock->waiters));
+
+	/* Transfer the remaining lock waiters to the next lock holder. */
+	vdo_transfer_all_waiters(&lock->waiters, &next_lock_holder->logical.waiters);
+
+	result = vdo_int_map_put(lock->zone->lbn_operations,
+				 lock->lbn,
+				 next_lock_holder,
+				 true,
+				 (void **) &lock_holder);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(next_lock_holder, result);
+		return;
+	}
+
+	ASSERT_LOG_ONLY((lock_holder == data_vio),
+			"logical block lock mismatch for block %llu",
+			(unsigned long long) lock->lbn);
+	lock->locked = false;
+
+	/*
+	 * If there are still waiters, other data_vios must be trying to get the lock we just
+	 * transferred. We must ensure that the new lock holder doesn't block in the packer.
+	 */
+	if (vdo_has_waiters(&next_lock_holder->logical.waiters))
+		cancel_data_vio_compression(next_lock_holder);
+
+	/*
+	 * Avoid stack overflow on lock transfer.
+	 * FIXME: this is only an issue in the 1 thread config.
+	 */
+	next_lock_holder->vio.completion.requeue = true;
+	launch_locked_request(next_lock_holder);
+}
+
+/**
+ * release_logical_lock() - Release the logical block lock and flush generation lock at the end of
+ *			    processing a data_vio.
+ */
+static void release_logical_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct lbn_lock *lock = &data_vio->logical;
+
+	assert_data_vio_in_logical_zone(data_vio);
+
+	if (vdo_has_waiters(&lock->waiters))
+		transfer_lock(data_vio, lock);
+	else
+		release_lock(data_vio, lock);
+
+	vdo_release_flush_generation_lock(data_vio);
+	perform_cleanup_stage(data_vio, VIO_CLEANUP_DONE);
+}
+
+/** clean_hash_lock() - Release the hash lock at the end of processing a data_vio. */
+static void clean_hash_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_hash_zone(data_vio);
+	if (completion->result != VDO_SUCCESS) {
+		vdo_clean_failed_hash_lock(data_vio);
+		return;
+	}
+
+	vdo_release_hash_lock(data_vio);
+	perform_cleanup_stage(data_vio, VIO_RELEASE_LOGICAL);
+}
+
+/**
+ * finish_cleanup() - Make some assertions about a data_vio which has finished cleaning up.
+ *
+ * If the data_vio is part of a multi-block discard, this continues with the discard's next
+ * block; otherwise, it returns the data_vio to the pool.
+ */
+static void finish_cleanup(struct data_vio *data_vio)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+
+	ASSERT_LOG_ONLY(data_vio->allocation.lock == NULL,
+			"complete data_vio has no allocation lock");
+	ASSERT_LOG_ONLY(data_vio->hash_lock == NULL, "complete data_vio has no hash lock");
+	if ((data_vio->remaining_discard <= VDO_BLOCK_SIZE) ||
+	    (completion->result != VDO_SUCCESS)) {
+		struct data_vio_pool *pool = completion->vdo->data_vio_pool;
+
+		uds_funnel_queue_put(pool->queue, &completion->work_queue_entry_link);
+		schedule_releases(pool);
+		return;
+	}
+
+	data_vio->remaining_discard -= min_t(u32,
+					     data_vio->remaining_discard,
+					     VDO_BLOCK_SIZE - data_vio->offset);
+	data_vio->is_partial = (data_vio->remaining_discard < VDO_BLOCK_SIZE);
+	data_vio->read = data_vio->is_partial;
+	data_vio->offset = 0;
+	completion->requeue = true;
+	launch_data_vio(data_vio, data_vio->logical.lbn + 1);
+}
+
+/** perform_cleanup_stage() - Perform the next step in the process of cleaning up a data_vio. */
+static void perform_cleanup_stage(struct data_vio *data_vio, enum data_vio_cleanup_stage stage)
+{
+	struct vdo *vdo = vdo_from_data_vio(data_vio);
+
+	switch (stage) {
+	case VIO_RELEASE_HASH_LOCK:
+		if (data_vio->hash_lock != NULL) {
+			launch_data_vio_hash_zone_callback(data_vio, clean_hash_lock);
+			return;
+		}
+		fallthrough;
+
+	case VIO_RELEASE_ALLOCATED:
+		if (data_vio_has_allocation(data_vio)) {
+			launch_data_vio_allocated_zone_callback(data_vio, release_allocated_lock);
+			return;
+		}
+		fallthrough;
+
+	case VIO_RELEASE_RECOVERY_LOCKS:
+		if ((data_vio->recovery_sequence_number > 0) &&
+		    (READ_ONCE(vdo->read_only_notifier.read_only_error) == VDO_SUCCESS) &&
+		    (data_vio->vio.completion.result != VDO_READ_ONLY))
+			uds_log_warning("VDO not read-only when cleaning data_vio with RJ lock");
+		fallthrough;
+
+	case VIO_RELEASE_LOGICAL:
+		launch_data_vio_logical_callback(data_vio, release_logical_lock);
+		return;
+
+	default:
+		finish_cleanup(data_vio);
+	}
+}
+
+void complete_data_vio(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	completion->error_handler = NULL;
+	data_vio->last_async_operation = VIO_ASYNC_OP_CLEANUP;
+	perform_cleanup_stage(data_vio,
+			      (data_vio->write ? VIO_CLEANUP_START : VIO_RELEASE_LOGICAL));
+}
+
+static void enter_read_only_mode(struct vdo_completion *completion)
+{
+	if (vdo_is_read_only(completion->vdo))
+		return;
+
+	if (completion->result != VDO_READ_ONLY) {
+		struct data_vio *data_vio = as_data_vio(completion);
+
+		uds_log_error_strerror(completion->result,
+				       "Preparing to enter read-only mode: data_vio for LBN %llu (becoming mapped to %llu, previously mapped to %llu, allocated %llu) is completing with a fatal error after operation %s",
+				       (unsigned long long) data_vio->logical.lbn,
+				       (unsigned long long) data_vio->new_mapped.pbn,
+				       (unsigned long long) data_vio->mapped.pbn,
+				       (unsigned long long) data_vio->allocation.pbn,
+				       get_data_vio_operation_name(data_vio));
+	}
+
+	vdo_enter_read_only_mode(completion->vdo, completion->result);
+}
+
+void handle_data_vio_error(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	if ((completion->result == VDO_READ_ONLY) || (data_vio->user_bio == NULL))
+		enter_read_only_mode(completion);
+
+	update_data_vio_error_stats(data_vio);
+	complete_data_vio(completion);
+}
+
+/**
+ * get_data_vio_operation_name() - Get the name of the last asynchronous operation performed on a
+ *				   data_vio.
+ */
+const char *get_data_vio_operation_name(struct data_vio *data_vio)
+{
+	STATIC_ASSERT((MAX_VIO_ASYNC_OPERATION_NUMBER - MIN_VIO_ASYNC_OPERATION_NUMBER) ==
+		      ARRAY_SIZE(ASYNC_OPERATION_NAMES));
+
+	return ((data_vio->last_async_operation < MAX_VIO_ASYNC_OPERATION_NUMBER) ?
+		ASYNC_OPERATION_NAMES[data_vio->last_async_operation] :
+		"unknown async operation");
+}
+
+/**
+ * data_vio_allocate_data_block() - Allocate a data block.
+ *
+ * @data_vio: The data_vio which needs an allocation.
+ * @write_lock_type: The type of write lock to obtain on the block.
+ * @callback: The callback which will attempt an allocation in the current zone and continue if it
+ *	      succeeds.
+ * @error_handler: The handler for errors while allocating.
+ */
+void data_vio_allocate_data_block(struct data_vio *data_vio,
+				  enum pbn_lock_type write_lock_type,
+				  vdo_action *callback,
+				  vdo_action *error_handler)
+{
+	struct allocation *allocation = &data_vio->allocation;
+
+	ASSERT_LOG_ONLY((allocation->pbn == VDO_ZERO_BLOCK),
+			"data_vio does not have an allocation");
+	allocation->write_lock_type = write_lock_type;
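+	/*
+	 * Record the zone in which the allocation attempt starts so the search can tell when it
+	 * has tried every physical zone.
+	 */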
+	allocation->zone = vdo_get_next_allocation_zone(data_vio->logical.zone);
+	allocation->first_allocation_zone = allocation->zone->zone_number;
+
+	data_vio->vio.completion.error_handler = error_handler;
+	launch_data_vio_allocated_zone_callback(data_vio, callback);
+}
+
+void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset)
+{
+	struct allocation *allocation = &data_vio->allocation;
+	physical_block_number_t locked_pbn = allocation->pbn;
+
+	assert_data_vio_in_allocated_zone(data_vio);
+
+	if (reset || vdo_pbn_lock_has_provisional_reference(allocation->lock))
+		allocation->pbn = VDO_ZERO_BLOCK;
+
+	vdo_release_physical_zone_pbn_lock(allocation->zone,
+					   locked_pbn,
+					   UDS_FORGET(allocation->lock));
+}
+
+/**
+ * uncompress_data_vio() - Uncompress the data a data_vio has just read.
+ * @data_vio: The data_vio whose data is to be uncompressed.
+ * @mapping_state: The mapping state indicating which fragment to decompress.
+ * @buffer: The buffer to receive the uncompressed data.
+ */
+int uncompress_data_vio(struct data_vio *data_vio,
+			enum block_mapping_state mapping_state,
+			char *buffer)
+{
+	int size;
+	u16 fragment_offset, fragment_size;
+	struct compressed_block *block = data_vio->compression.block;
+	int result = vdo_get_compressed_block_fragment(mapping_state,
+						       block,
+						       &fragment_offset,
+						       &fragment_size);
+
+	if (result != VDO_SUCCESS) {
+		uds_log_debug("%s: compressed fragment error %d", __func__, result);
+		return result;
+	}
+
+	size = LZ4_decompress_safe((block->data + fragment_offset),
+				   buffer,
+				   fragment_size,
+				   VDO_BLOCK_SIZE);
+	if (size != VDO_BLOCK_SIZE) {
+		uds_log_debug("%s: lz4 error", __func__);
+		return VDO_INVALID_FRAGMENT;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * modify_for_partial_write() - Do the modify-write part of a read-modify-write cycle.
+ * @completion: The data_vio which has just finished its read.
+ *
+ * This callback is registered in read_block().
+ */
+static void modify_for_partial_write(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	char *data = data_vio->vio.data;
+	struct bio *bio = data_vio->user_bio;
+
+	assert_data_vio_on_cpu_thread(data_vio);
+
+	if (bio_op(bio) == REQ_OP_DISCARD) {
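+		/* A discard of a partial block zeros only the discarded portion of the block. */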
+		memset(data + data_vio->offset, '\0', min_t(u32,
+							    data_vio->remaining_discard,
+							    VDO_BLOCK_SIZE - data_vio->offset));
+	} else {
+		copy_from_bio(bio, data + data_vio->offset);
+	}
+
+	data_vio->is_zero = is_zero_block(data);
+	data_vio->read = false;
+	launch_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot);
+}
+
+static void complete_read(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	char *data = data_vio->vio.data;
+	bool compressed = vdo_is_state_compressed(data_vio->mapped.state);
+
+	assert_data_vio_on_cpu_thread(data_vio);
+
+	if (compressed) {
+		int result = uncompress_data_vio(data_vio, data_vio->mapped.state, data);
+
+		if (result != VDO_SUCCESS) {
+			continue_data_vio_with_error(data_vio, result);
+			return;
+		}
+	}
+
+	if (data_vio->write) {
+		modify_for_partial_write(completion);
+		return;
+	}
+
+	if (compressed || data_vio->is_partial)
+		copy_to_bio(data_vio->user_bio, data + data_vio->offset);
+
+	acknowledge_data_vio(data_vio);
+	complete_data_vio(completion);
+}
+
+static void read_endio(struct bio *bio)
+{
+	struct data_vio *data_vio = vio_as_data_vio(bio->bi_private);
+	int result = blk_status_to_errno(bio->bi_status);
+
+	vdo_count_completed_bios(bio);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	launch_data_vio_cpu_callback(data_vio, complete_read, CPU_Q_COMPLETE_READ_PRIORITY);
+}
+
+static void complete_zero_read(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_on_cpu_thread(data_vio);
+
+	if (data_vio->is_partial) {
+		memset(data_vio->vio.data, 0, VDO_BLOCK_SIZE);
+		if (data_vio->write) {
+			modify_for_partial_write(completion);
+			return;
+		}
+	} else {
+		zero_fill_bio(data_vio->user_bio);
+	}
+
+	complete_read(completion);
+}
+
+/**
+ * read_block() - Read a block asynchronously.
+ *
+ * This is the callback registered in read_block_mapping().
+ */
+static void read_block(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct vio *vio = as_vio(completion);
+	int result = VDO_SUCCESS;
+
+	if (data_vio->mapped.pbn == VDO_ZERO_BLOCK) {
+		launch_data_vio_cpu_callback(data_vio,
+					     complete_zero_read,
+					     CPU_Q_COMPLETE_VIO_PRIORITY);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_READ_DATA_VIO;
+	if (vdo_is_state_compressed(data_vio->mapped.state)) {
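+		/*
+		 * Read the entire compressed block; the relevant fragment will be extracted on a
+		 * CPU thread in complete_read().
+		 */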
+		result = vio_reset_bio(vio,
+				       (char *) data_vio->compression.block,
+				       read_endio,
+				       REQ_OP_READ,
+				       data_vio->mapped.pbn);
+	} else {
+		int opf = ((data_vio->user_bio->bi_opf & PASSTHROUGH_FLAGS) | REQ_OP_READ);
+
+		if (data_vio->is_partial) {
+			result = vio_reset_bio(vio,
+					       vio->data,
+					       read_endio,
+					       opf,
+					       data_vio->mapped.pbn);
+		} else {
+			/* A full 4k read. Use the incoming bio to avoid having to copy the data */
+			bio_reset(vio->bio, vio->bio->bi_bdev, opf);
+			bio_init_clone(data_vio->user_bio->bi_bdev,
+				       vio->bio,
+				       data_vio->user_bio,
+				       GFP_KERNEL);
+
+			/* Copy over the original bio iovec and opflags. */
+			vdo_set_bio_properties(vio->bio,
+					       vio,
+					       read_endio,
+					       opf,
+					       data_vio->mapped.pbn);
+		}
+	}
+
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	submit_data_vio_io(data_vio);
+}
+
+static inline struct data_vio *
+reference_count_update_completion_as_data_vio(struct vdo_completion *completion)
+{
+	if (completion->type == VIO_COMPLETION)
+		return as_data_vio(completion);
+
+	return container_of(completion, struct data_vio, decrement_completion);
+}
+
+/**
+ * update_block_map() - Rendezvous of the data_vio and decrement completions after each has
+ *                      made its reference updates. Handle any error from either, or proceed
+ *                      to updating the block map.
+ * @completion: The completion of the write in progress.
+ */
+static void update_block_map(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = reference_count_update_completion_as_data_vio(completion);
+
+	assert_data_vio_in_logical_zone(data_vio);
+
+	if (!data_vio->first_reference_operation_complete) {
+		/* Rendezvous, we're first */
+		data_vio->first_reference_operation_complete = true;
+		return;
+	}
+
+	completion = &data_vio->vio.completion;
+	vdo_set_completion_result(completion, data_vio->decrement_completion.result);
+	if (completion->result != VDO_SUCCESS) {
+		handle_data_vio_error(completion);
+		return;
+	}
+
+	completion->error_handler = handle_data_vio_error;
+	if (data_vio->hash_lock != NULL)
+		set_data_vio_hash_zone_callback(data_vio, vdo_continue_hash_lock);
+	else
+		completion->callback = complete_data_vio;
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_PUT_MAPPED_BLOCK;
+	vdo_put_mapped_block(data_vio);
+}
+
+static void decrement_reference_count(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = container_of(completion,
+						 struct data_vio,
+						 decrement_completion);
+
+	assert_data_vio_in_mapped_zone(data_vio);
+
+	vdo_set_completion_callback(completion,
+				    update_block_map,
+				    data_vio->logical.zone->thread_id);
+	completion->error_handler = update_block_map;
+	vdo_modify_reference_count(completion, &data_vio->decrement_updater);
+}
+
+static void increment_reference_count(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_new_mapped_zone(data_vio);
+
+	if (data_vio->downgrade_allocation_lock) {
+		/*
+		 * Now that the data has been written, it's safe to deduplicate against the
+		 * block. Downgrade the allocation lock to a read lock so it can be used later by
+		 * the hash lock. This is done here since it needs to happen sometime before we
+		 * return to the hash zone, and we are currently on the correct thread. For
+		 * compressed blocks, the downgrade will have already been done.
+		 */
+		vdo_downgrade_pbn_write_lock(data_vio->allocation.lock, false);
+	}
+
+	set_data_vio_logical_callback(data_vio, update_block_map);
+	completion->error_handler = update_block_map;
+	vdo_modify_reference_count(completion, &data_vio->increment_updater);
+}
+
+/** journal_remapping() - Add a recovery journal entry for a data remapping. */
+static void journal_remapping(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_journal_zone(data_vio);
+
+	data_vio->decrement_updater.operation = VDO_JOURNAL_DATA_REMAPPING;
+	data_vio->decrement_updater.zpbn = data_vio->mapped;
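+	/*
+	 * A data_vio with no new mapping needs no reference increment, and one with no old
+	 * mapping needs no decrement; mark the skipped operation as already complete so the
+	 * rendezvous in update_block_map() will not wait for it.
+	 */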
+	if (data_vio->new_mapped.pbn == VDO_ZERO_BLOCK) {
+		data_vio->first_reference_operation_complete = true;
+		if (data_vio->mapped.pbn == VDO_ZERO_BLOCK)
+			set_data_vio_logical_callback(data_vio, update_block_map);
+	} else {
+		set_data_vio_new_mapped_zone_callback(data_vio, increment_reference_count);
+	}
+
+	if (data_vio->mapped.pbn == VDO_ZERO_BLOCK)
+		data_vio->first_reference_operation_complete = true;
+	else
+		vdo_set_completion_callback(&data_vio->decrement_completion,
+					    decrement_reference_count,
+					    data_vio->mapped.zone->thread_id);
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_JOURNAL_REMAPPING;
+	vdo_add_recovery_journal_entry(completion->vdo->recovery_journal, data_vio);
+}
+
+/**
+ * read_old_block_mapping() - Get the previous PBN/LBN mapping of an in-progress write.
+ *
+ * Gets the previous PBN mapped to this LBN from the block map, so as to make an appropriate
+ * journal entry referencing the removal of this LBN->PBN mapping.
+ */
+static void read_old_block_mapping(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_logical_zone(data_vio);
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_WRITE;
+	set_data_vio_journal_callback(data_vio, journal_remapping);
+	vdo_get_mapped_block(data_vio);
+}
+
+void update_metadata_for_data_vio_write(struct data_vio *data_vio, struct pbn_lock *lock)
+{
+	data_vio->increment_updater = (struct reference_updater) {
+		.operation = VDO_JOURNAL_DATA_REMAPPING,
+		.increment = true,
+		.zpbn = data_vio->new_mapped,
+		.lock = lock,
+	};
+
+	launch_data_vio_logical_callback(data_vio, read_old_block_mapping);
+}
+
+/**
+ * pack_compressed_data() - Attempt to pack the compressed data_vio into a block.
+ *
+ * This is the callback registered in launch_compress_data_vio().
+ */
+static void pack_compressed_data(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_packer_zone(data_vio);
+
+	if (!vdo_get_compressing(vdo_from_data_vio(data_vio)) ||
+	    get_data_vio_compression_status(data_vio).may_not_compress) {
+		write_data_vio(data_vio);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_PACKING;
+	vdo_attempt_packing(data_vio);
+}
+
+/**
+ * compress_data_vio() - Do the actual work of compressing the data on a CPU queue.
+ *
+ * This callback is registered in launch_compress_data_vio().
+ */
+static void compress_data_vio(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	int size;
+
+	assert_data_vio_on_cpu_thread(data_vio);
+
+	/*
+	 * By putting the compressed data at the start of the compressed block data field, we won't
+	 * need to copy it if this data_vio becomes a compressed write agent.
+	 */
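+	/* The CPU queue's per-thread private data provides the LZ4 scratch workspace. */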
+	size = LZ4_compress_default(data_vio->vio.data,
+				    data_vio->compression.block->data,
+				    VDO_BLOCK_SIZE,
+				    VDO_MAX_COMPRESSED_FRAGMENT_SIZE,
+				    (char *) vdo_get_work_queue_private_data());
+	if ((size > 0) && (size < VDO_COMPRESSED_BLOCK_DATA_SIZE)) {
+		data_vio->compression.size = size;
+		launch_data_vio_packer_callback(data_vio, pack_compressed_data);
+		return;
+	}
+
+	write_data_vio(data_vio);
+}
+
+/**
+ * launch_compress_data_vio() - Continue a write by attempting to compress the data.
+ *
+ * This is a re-entry point to vio_write used by hash locks.
+ */
+void launch_compress_data_vio(struct data_vio *data_vio)
+{
+	ASSERT_LOG_ONLY(!data_vio->is_duplicate, "compressing a non-duplicate block");
+	ASSERT_LOG_ONLY(data_vio->hash_lock != NULL, "data_vio to compress has a hash_lock");
+	ASSERT_LOG_ONLY(data_vio_has_allocation(data_vio),
+			"data_vio to compress has an allocation");
+
+	/*
+	 * There are 4 reasons why a data_vio which has reached this point will not be eligible for
+	 * compression:
+	 *
+	 * 1) Since data_vios can block indefinitely in the packer, it would be bad to do so if the
+	 * write request also requests FUA.
+	 *
+	 * 2) A data_vio should not be compressed when compression is disabled for the vdo.
+	 *
+	 * 3) A data_vio could be doing a partial write on behalf of a larger discard which has not
+	 * yet been acknowledged and hence blocking in the packer would be bad.
+	 *
+	 * 4) Some other data_vio may be waiting on this data_vio in which case blocking in the
+	 * packer would also be bad.
+	 */
+	if (data_vio->fua ||
+	    !vdo_get_compressing(vdo_from_data_vio(data_vio)) ||
+	    ((data_vio->user_bio != NULL) && (bio_op(data_vio->user_bio) == REQ_OP_DISCARD)) ||
+	    (advance_data_vio_compression_stage(data_vio).stage != DATA_VIO_COMPRESSING)) {
+		write_data_vio(data_vio);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_COMPRESS_DATA_VIO;
+	launch_data_vio_cpu_callback(data_vio, compress_data_vio, CPU_Q_COMPRESS_BLOCK_PRIORITY);
+}
+
+/**
+ * hash_data_vio() - Hash the data in a data_vio and set the hash zone (which also flags the record
+ *		     name as set).
+ *
+ * This callback is registered in prepare_for_dedupe().
+ */
+static void hash_data_vio(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_on_cpu_thread(data_vio);
+	ASSERT_LOG_ONLY(!data_vio->is_zero, "zero blocks should not be hashed");
+
+	murmurhash3_128(data_vio->vio.data,
+			VDO_BLOCK_SIZE,
+			0x62ea60be,
+			&data_vio->record_name);
+
+	data_vio->hash_zone = vdo_select_hash_zone(vdo_from_data_vio(data_vio)->hash_zones,
+						   &data_vio->record_name);
+	data_vio->last_async_operation = VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK;
+	launch_data_vio_hash_zone_callback(data_vio, vdo_acquire_hash_lock);
+}
+
+/** prepare_for_dedupe() - Prepare for the dedupe path after attempting to get an allocation. */
+static void prepare_for_dedupe(struct data_vio *data_vio)
+{
+	/* We don't care what thread we are on. */
+	ASSERT_LOG_ONLY(!data_vio->is_zero, "must not prepare to dedupe zero blocks");
+
+	/*
+	 * Before we can dedupe, we need to know the record name, so the first
+	 * step is to hash the block data.
+	 */
+	data_vio->last_async_operation = VIO_ASYNC_OP_HASH_DATA_VIO;
+	launch_data_vio_cpu_callback(data_vio, hash_data_vio, CPU_Q_HASH_BLOCK_PRIORITY);
+}
+
+/**
+ * write_bio_finished() - This is the bio_end_io function registered in write_block() to be called
+ *			  when a data_vio's write to the underlying storage has completed.
+ */
+static void write_bio_finished(struct bio *bio)
+{
+	struct data_vio *data_vio = vio_as_data_vio((struct vio *) bio->bi_private);
+
+	vdo_count_completed_bios(bio);
+	vdo_set_completion_result(&data_vio->vio.completion, blk_status_to_errno(bio->bi_status));
+	data_vio->downgrade_allocation_lock = true;
+	update_metadata_for_data_vio_write(data_vio, data_vio->allocation.lock);
+}
+
+/** write_data_vio() - Write a data block to storage without compression. */
+void write_data_vio(struct data_vio *data_vio)
+{
+	struct data_vio_compression_status status, new_status;
+	int result;
+
+	if (!data_vio_has_allocation(data_vio)) {
+		/*
+		 * There was no space to write this block and we failed to deduplicate or compress
+		 * it.
+		 */
+		continue_data_vio_with_error(data_vio, VDO_NO_SPACE);
+		return;
+	}
+
+	new_status = (struct data_vio_compression_status) {
+		.stage = DATA_VIO_POST_PACKER,
+		.may_not_compress = true,
+	};
+
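+	/*
+	 * Atomically move this data_vio out of the compression path, unless it has already
+	 * reached the post-packer stage, so that it cannot be packed while this uncompressed
+	 * write is in progress.
+	 */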
+	do {
+		status = get_data_vio_compression_status(data_vio);
+	} while ((status.stage != DATA_VIO_POST_PACKER) &&
+		 !set_data_vio_compression_status(data_vio, status, new_status));
+
+	/* Write the data from the data block buffer. */
+	result = vio_reset_bio(&data_vio->vio,
+			       data_vio->vio.data,
+			       write_bio_finished,
+			       REQ_OP_WRITE,
+			       data_vio->allocation.pbn);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_WRITE_DATA_VIO;
+	submit_data_vio_io(data_vio);
+}
+
+/**
+ * acknowledge_write_callback() - Acknowledge a write to the requestor.
+ *
+ * This callback is registered in allocate_block() and continue_write_with_block_map_slot().
+ */
+static void acknowledge_write_callback(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct vdo *vdo = completion->vdo;
+
+	ASSERT_LOG_ONLY((!vdo_uses_bio_ack_queue(vdo) ||
+			 (vdo_get_callback_thread_id() == vdo->thread_config.bio_ack_thread)),
+			"%s() called on bio ack queue",
+			__func__);
+	ASSERT_LOG_ONLY(data_vio_has_flush_generation_lock(data_vio),
+			"write VIO to be acknowledged has a flush generation lock");
+	acknowledge_data_vio(data_vio);
+	if (data_vio->new_mapped.pbn == VDO_ZERO_BLOCK) {
+		/* This is a zero write or discard */
+		update_metadata_for_data_vio_write(data_vio, NULL);
+		return;
+	}
+
+	prepare_for_dedupe(data_vio);
+}
+
+/**
+ * allocate_block() - Attempt to allocate a block in the current allocation zone.
+ *
+ * This callback is registered in continue_write_with_block_map_slot().
+ */
+static void allocate_block(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_allocated_zone(data_vio);
+
+	if (!vdo_allocate_block_in_zone(data_vio))
+		return;
+
+	completion->error_handler = handle_data_vio_error;
+	WRITE_ONCE(data_vio->allocation_succeeded, true);
+	data_vio->new_mapped = (struct zoned_pbn) {
+		.zone = data_vio->allocation.zone,
+		.pbn = data_vio->allocation.pbn,
+		.state = VDO_MAPPING_STATE_UNCOMPRESSED,
+	};
+
+	if (data_vio->fua) {
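+		/* FUA writes cannot be acknowledged until the data has reached stable storage. */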
+		prepare_for_dedupe(data_vio);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE;
+	launch_data_vio_on_bio_ack_queue(data_vio, acknowledge_write_callback);
+}
+
+/**
+ * handle_allocation_error() - Handle an error attempting to allocate a block.
+ *
+ * This error handler is registered in continue_write_with_block_map_slot().
+ */
+static void handle_allocation_error(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	if (completion->result == VDO_NO_SPACE) {
+		/* We failed to get an allocation, but we can try to dedupe. */
+		vdo_reset_completion(completion);
+		completion->error_handler = handle_data_vio_error;
+		prepare_for_dedupe(data_vio);
+		return;
+	}
+
+	/* We got a "real" error, not just a failure to allocate, so fail the request. */
+	handle_data_vio_error(completion);
+}
+
+static int assert_is_trim(struct data_vio *data_vio)
+{
+	int result = ASSERT(data_vio->is_trim, "data_vio with no block map page is a trim");
+
+	return ((result == VDO_SUCCESS) ? result : VDO_READ_ONLY);
+}
+
+/**
+ * continue_data_vio_with_block_map_slot() - Read the data_vio's mapping from the block map.
+ *
+ * This callback is registered in launch_read_data_vio().
+ */
+void continue_data_vio_with_block_map_slot(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+
+	assert_data_vio_in_logical_zone(data_vio);
+	if (data_vio->read) {
+		set_data_vio_logical_callback(data_vio, read_block);
+		data_vio->last_async_operation = VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ;
+		vdo_get_mapped_block(data_vio);
+		return;
+	}
+
+	vdo_acquire_flush_generation_lock(data_vio);
+
+	if (data_vio->tree_lock.tree_slots[0].block_map_slot.pbn == VDO_ZERO_BLOCK) {
+		/*
+		 * This is a trim for a block on a block map page which has not been allocated, so
+		 * there's nothing more we need to do.
+		 */
+		completion->callback = complete_data_vio;
+		continue_data_vio_with_error(data_vio, assert_is_trim(data_vio));
+		return;
+	}
+
+	/*
+	 * We need an allocation if this is neither a full-block trim nor a
+	 * full-block zero write.
+	 */
+	if (!data_vio->is_zero && (!data_vio->is_trim || data_vio->is_partial)) {
+		data_vio_allocate_data_block(data_vio,
+					     VIO_WRITE_LOCK,
+					     allocate_block,
+					     handle_allocation_error);
+		return;
+	}
+
+	/*
+	 * We don't need to write any data, so skip allocation and just update the block map and
+	 * reference counts (via the journal).
+	 */
+	data_vio->new_mapped.pbn = VDO_ZERO_BLOCK;
+	if (data_vio->is_zero)
+		data_vio->new_mapped.state = VDO_MAPPING_STATE_UNCOMPRESSED;
+
+	if (data_vio->remaining_discard > VDO_BLOCK_SIZE) {
+		/* This is not the final block of a discard so we can't acknowledge it yet. */
+		update_metadata_for_data_vio_write(data_vio, NULL);
+		return;
+	}
+
+	data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE;
+	launch_data_vio_on_bio_ack_queue(data_vio, acknowledge_write_callback);
+}
diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h
new file mode 100644
index 00000000000..0049d6ed0a8
--- /dev/null
+++ b/drivers/md/dm-vdo/data-vio.h
@@ -0,0 +1,689 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef DATA_VIO_H
+#define DATA_VIO_H
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+#include <linux/list.h>
+
+#include "permassert.h"
+#include "uds.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "dedupe.h"
+#include "encodings.h"
+#include "logical-zone.h"
+#include "physical-zone.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/* Codes for describing the last asynchronous operation performed on a vio. */
+enum async_operation_number {
+	MIN_VIO_ASYNC_OPERATION_NUMBER,
+	VIO_ASYNC_OP_LAUNCH = MIN_VIO_ASYNC_OPERATION_NUMBER,
+	VIO_ASYNC_OP_ACKNOWLEDGE_WRITE,
+	VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK,
+	VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK,
+	VIO_ASYNC_OP_LOCK_DUPLICATE_PBN,
+	VIO_ASYNC_OP_CHECK_FOR_DUPLICATION,
+	VIO_ASYNC_OP_CLEANUP,
+	VIO_ASYNC_OP_COMPRESS_DATA_VIO,
+	VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT,
+	VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ,
+	VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_WRITE,
+	VIO_ASYNC_OP_HASH_DATA_VIO,
+	VIO_ASYNC_OP_JOURNAL_REMAPPING,
+	VIO_ASYNC_OP_ATTEMPT_PACKING,
+	VIO_ASYNC_OP_PUT_MAPPED_BLOCK,
+	VIO_ASYNC_OP_READ_DATA_VIO,
+	VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX,
+	VIO_ASYNC_OP_UPDATE_REFERENCE_COUNTS,
+	VIO_ASYNC_OP_VERIFY_DUPLICATION,
+	VIO_ASYNC_OP_WRITE_DATA_VIO,
+	MAX_VIO_ASYNC_OPERATION_NUMBER,
+} __packed;
+
+struct lbn_lock {
+	logical_block_number_t lbn;
+	bool locked;
+	struct wait_queue waiters;
+	struct logical_zone *zone;
+};
+
+/* A position in the arboreal block map at a specific level. */
+struct block_map_tree_slot {
+	page_number_t page_index;
+	struct block_map_slot block_map_slot;
+};
+
+/* Fields for using the arboreal block map. */
+struct tree_lock {
+	/* The current height at which this data_vio is operating */
+	height_t height;
+	/* The block map tree for this LBN */
+	root_count_t root_index;
+	/* Whether we hold a page lock */
+	bool locked;
+	/* The key for the lock map */
+	u64 key;
+	/* The queue of waiters for the page this vio is allocating or loading */
+	struct wait_queue waiters;
+	/* The block map tree slots for this LBN */
+	struct block_map_tree_slot tree_slots[VDO_BLOCK_MAP_TREE_HEIGHT + 1];
+};
+
+struct zoned_pbn {
+	physical_block_number_t pbn;
+	enum block_mapping_state state;
+	struct physical_zone *zone;
+};
+
+/*
+ * Where a data_vio is on the compression path; advance_compression_stage() depends on the order of
+ * this enum.
+ */
+enum data_vio_compression_stage {
+	/* A data_vio which has not yet entered the compression path */
+	DATA_VIO_PRE_COMPRESSOR,
+	/* A data_vio which is in the compressor */
+	DATA_VIO_COMPRESSING,
+	/* A data_vio which is blocked in the packer */
+	DATA_VIO_PACKING,
+	/* A data_vio which is no longer on the compression path (and never will be) */
+	DATA_VIO_POST_PACKER,
+};
+
+struct data_vio_compression_status {
+	enum data_vio_compression_stage stage;
+	bool may_not_compress;
+};
+
+struct compression_state {
+	/*
+	 * The current compression status of this data_vio. This field contains a value which
+	 * consists of a data_vio_compression_stage and a flag indicating whether a request has
+	 * been made to cancel (or prevent) compression for this data_vio.
+	 *
+	 * This field should be accessed through the get_data_vio_compression_status() and
+	 * set_data_vio_compression_status() methods. It should not be accessed directly.
+	 */
+	atomic_t status;
+
+	/* The compressed size of this block */
+	u16 size;
+
+	/* The packer input or output bin slot which holds the enclosing data_vio */
+	slot_number_t slot;
+
+	/* The packer bin to which the enclosing data_vio has been assigned */
+	struct packer_bin *bin;
+
+	/* A link in the chain of data_vios which have been packed together */
+	struct data_vio *next_in_batch;
+
+	/* A vio which is blocked in the packer while holding a lock this vio needs. */
+	struct data_vio *lock_holder;
+
+	/*
+	 * The compressed block used to hold the compressed form of this block and that of any
+	 * other blocks for which this data_vio is the compressed write agent.
+	 */
+	struct compressed_block *block;
+};
+
+/* Fields supporting allocation of data blocks. */
+struct allocation {
+	/* The physical zone in which to allocate a physical block */
+	struct physical_zone *zone;
+
+	/* The block allocated to this vio */
+	physical_block_number_t pbn;
+
+	/*
+	 * If non-NULL, the pooled PBN lock held on the allocated block. Must be a write lock until
+	 * the block has been written, after which it will become a read lock.
+	 */
+	struct pbn_lock *lock;
+
+	/* The type of write lock to obtain on the allocated block */
+	enum pbn_lock_type write_lock_type;
+
+	/* The zone which was the start of the current allocation cycle */
+	zone_count_t first_allocation_zone;
+
+	/* Whether this vio should wait for a clean slab */
+	bool wait_for_clean_slab;
+};
+
+struct reference_updater {
+	enum journal_operation operation;
+	bool increment;
+	struct zoned_pbn zpbn;
+	struct pbn_lock *lock;
+	struct waiter waiter;
+};
+
+/* A vio for processing user data requests. */
+struct data_vio {
+	/* The wait_queue entry structure */
+	struct waiter waiter;
+
+	/* The logical block of this request */
+	struct lbn_lock logical;
+
+	/* The state for traversing the block map tree */
+	struct tree_lock tree_lock;
+
+	/* The current partition address of this block */
+	struct zoned_pbn mapped;
+
+	/* The hash of this vio (if not zero) */
+	struct uds_record_name record_name;
+
+	/* Used for logging and debugging */
+	enum async_operation_number last_async_operation;
+
+	/* The operations to record in the recovery and slab journals */
+	struct reference_updater increment_updater;
+	struct reference_updater decrement_updater;
+
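+	/* Single-bit flags describing the request and the current state of this data_vio */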
+	u16 read : 1;
+	u16 write : 1;
+	u16 fua : 1;
+	u16 is_zero : 1;
+	u16 is_trim : 1;
+	u16 is_partial : 1;
+	u16 is_duplicate : 1;
+	u16 first_reference_operation_complete : 1;
+	u16 downgrade_allocation_lock : 1;
+
+	struct allocation allocation;
+
+	/*
+	 * Whether this vio has received an allocation. This field is examined from threads not in
+	 * the allocation zone.
+	 */
+	bool allocation_succeeded;
+
+	/* The new partition address of this block after the vio write completes */
+	struct zoned_pbn new_mapped;
+
+	/* The hash zone responsible for the name (NULL if is_zero_block) */
+	struct hash_zone *hash_zone;
+
+	/* The lock this vio holds or shares with other vios with the same data */
+	struct hash_lock *hash_lock;
+
+	/* All data_vios sharing a hash lock are kept in a list linking these list entries */
+	struct list_head hash_lock_entry;
+
+	/* The block number in the partition of the UDS deduplication advice */
+	struct zoned_pbn duplicate;
+
+	/*
+	 * The sequence number of the recovery journal block containing the increment entry for
+	 * this vio.
+	 */
+	sequence_number_t recovery_sequence_number;
+
+	/* The point in the recovery journal where this write last made an entry */
+	struct journal_point recovery_journal_point;
+
+	/* The list of vios in user initiated write requests */
+	struct list_head write_entry;
+
+	/* The generation number of the VDO that this vio belongs to */
+	sequence_number_t flush_generation;
+
+	/* The completion to use for fetching block map pages for this vio */
+	struct vdo_page_completion page_completion;
+
+	/* The user bio that initiated this VIO */
+	struct bio *user_bio;
+
+	/* partial block support */
+	block_size_t offset;
+
+	/*
+	 * The number of bytes to be discarded. For discards, this field will always be positive,
+	 * whereas for non-discards it will always be 0. Hence it can be used to determine whether
+	 * a data_vio is processing a discard, even after the user_bio has been acknowledged.
+	 */
+	u32 remaining_discard;
+
+	struct dedupe_context *dedupe_context;
+
+	/* Fields beyond this point will not be reset when a pooled data_vio is reused. */
+
+	struct vio vio;
+
+	/* The completion for making reference count decrements */
+	struct vdo_completion decrement_completion;
+
+	/* All of the fields necessary for the compression path */
+	struct compression_state compression;
+
+	/* A block used as output during compression or uncompression */
+	char *scratch_block;
+
+	struct list_head pool_entry;
+};
+
+static inline struct data_vio *vio_as_data_vio(struct vio *vio)
+{
+	ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "vio is a data_vio");
+	return container_of(vio, struct data_vio, vio);
+}
+
+static inline struct data_vio *as_data_vio(struct vdo_completion *completion)
+{
+	return vio_as_data_vio(as_vio(completion));
+}
+
+static inline struct data_vio *waiter_as_data_vio(struct waiter *waiter)
+{
+	if (waiter == NULL)
+		return NULL;
+
+	return container_of(waiter, struct data_vio, waiter);
+}
+
+static inline struct data_vio *
+data_vio_from_reference_updater(struct reference_updater *updater)
+{
+	if (updater->increment)
+		return container_of(updater, struct data_vio, increment_updater);
+
+	return container_of(updater, struct data_vio, decrement_updater);
+}
+
+static inline bool data_vio_has_flush_generation_lock(struct data_vio *data_vio)
+{
+	return !list_empty(&data_vio->write_entry);
+}
+
+static inline struct vdo *vdo_from_data_vio(struct data_vio *data_vio)
+{
+	return data_vio->vio.completion.vdo;
+}
+
+static inline bool data_vio_has_allocation(struct data_vio *data_vio)
+{
+	return (data_vio->allocation.pbn != VDO_ZERO_BLOCK);
+}
+
+struct data_vio_compression_status __must_check
+advance_data_vio_compression_stage(struct data_vio *data_vio);
+struct data_vio_compression_status __must_check
+get_data_vio_compression_status(struct data_vio *data_vio);
+bool cancel_data_vio_compression(struct data_vio *data_vio);
+
+struct data_vio_pool;
+
+int make_data_vio_pool(struct vdo *vdo,
+		       data_vio_count_t pool_size,
+		       data_vio_count_t discard_limit,
+		       struct data_vio_pool **pool_ptr);
+void free_data_vio_pool(struct data_vio_pool *pool);
+void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio);
+void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion);
+void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion);
+
+void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios);
+data_vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool);
+data_vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool);
+data_vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool);
+int __must_check
+set_data_vio_pool_discard_limit(struct data_vio_pool *pool, data_vio_count_t limit);
+data_vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool);
+data_vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool);
+data_vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool);
+
+void complete_data_vio(struct vdo_completion *completion);
+void handle_data_vio_error(struct vdo_completion *completion);
+
+static inline void continue_data_vio(struct data_vio *data_vio)
+{
+	vdo_launch_completion(&data_vio->vio.completion);
+}
+
+/**
+ * continue_data_vio_with_error() - Set an error code and then continue processing a data_vio.
+ *
+ * This will not mask older errors. This function can be called with a success code, but it is more
+ * efficient to call continue_data_vio() if the caller knows the result was a success.
+ */
+static inline void continue_data_vio_with_error(struct data_vio *data_vio, int result)
+{
+	vdo_continue_completion(&data_vio->vio.completion, result);
+}
+
+const char * __must_check get_data_vio_operation_name(struct data_vio *data_vio);
+
+static inline void assert_data_vio_in_hash_zone(struct data_vio *data_vio)
+{
+	thread_id_t expected = data_vio->hash_zone->thread_id;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+	/*
+	 * It's odd to use the LBN, but converting the record name to hex is a bit clunky for an
+	 * inline, and the LBN is better than nothing as an identifier.
+	 */
+	ASSERT_LOG_ONLY((expected == thread_id),
+			"data_vio for logical block %llu on thread %u, should be on hash zone thread %u",
+			(unsigned long long) data_vio->logical.lbn,
+			thread_id,
+			expected);
+}
+
+static inline void set_data_vio_hash_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	vdo_set_completion_callback(&data_vio->vio.completion,
+				    callback,
+				    data_vio->hash_zone->thread_id);
+}
+
+/**
+ * launch_data_vio_hash_zone_callback() - Set a callback as a hash zone operation and invoke it
+ *					  immediately.
+ */
+static inline void
+launch_data_vio_hash_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	set_data_vio_hash_zone_callback(data_vio, callback);
+	vdo_launch_completion(&data_vio->vio.completion);
+}
+
+static inline void assert_data_vio_in_logical_zone(struct data_vio *data_vio)
+{
+	thread_id_t expected = data_vio->logical.zone->thread_id;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((expected == thread_id),
+			"data_vio for logical block %llu on thread %u, should be on thread %u",
+			(unsigned long long) data_vio->logical.lbn,
+			thread_id,
+			expected);
+}
+
+static inline void set_data_vio_logical_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	vdo_set_completion_callback(&data_vio->vio.completion,
+				    callback,
+				    data_vio->logical.zone->thread_id);
+}
+
+/**
+ * launch_data_vio_logical_callback() - Set a callback as a logical block operation and invoke it
+ *					immediately.
+ */
+static inline void
+launch_data_vio_logical_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	set_data_vio_logical_callback(data_vio, callback);
+	vdo_launch_completion(&data_vio->vio.completion);
+}
+
+static inline void assert_data_vio_in_allocated_zone(struct data_vio *data_vio)
+{
+	thread_id_t expected = data_vio->allocation.zone->thread_id;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((expected == thread_id),
+			"struct data_vio for allocated physical block %llu on thread %u, should be on thread %u",
+			(unsigned long long) data_vio->allocation.pbn,
+			thread_id,
+			expected);
+}
+
+static inline void
+set_data_vio_allocated_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	vdo_set_completion_callback(&data_vio->vio.completion,
+				    callback,
+				    data_vio->allocation.zone->thread_id);
+}
+
+/**
+ * launch_data_vio_allocated_zone_callback() - Set a callback as a physical block operation in a
+ *					       data_vio's allocated zone and invoke it
+ *					       immediately.
+ */
+static inline void
+launch_data_vio_allocated_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	set_data_vio_allocated_zone_callback(data_vio, callback);
+	vdo_launch_completion(&data_vio->vio.completion);
+}
+
+static inline void assert_data_vio_in_duplicate_zone(struct data_vio *data_vio)
+{
+	thread_id_t expected = data_vio->duplicate.zone->thread_id;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((expected == thread_id),
+			"data_vio for duplicate physical block %llu on thread %u, should be on thread %u",
+			(unsigned long long) data_vio->duplicate.pbn,
+			thread_id,
+			expected);
+}
+
+static inline void
+set_data_vio_duplicate_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	vdo_set_completion_callback(&data_vio->vio.completion,
+				    callback,
+				    data_vio->duplicate.zone->thread_id);
+}
+
+/**
+ * launch_data_vio_duplicate_zone_callback() - Set a callback as a physical block operation in a
+ *					       data_vio's duplicate zone and invoke it
+ *					       immediately.
+ */
+static inline void
+launch_data_vio_duplicate_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	set_data_vio_duplicate_zone_callback(data_vio, callback);
+	vdo_launch_completion(&data_vio->vio.completion);
+}
+
+static inline void assert_data_vio_in_mapped_zone(struct data_vio *data_vio)
+{
+	thread_id_t expected = data_vio->mapped.zone->thread_id;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((expected == thread_id),
+			"data_vio for mapped physical block %llu on thread %u, should be on thread %u",
+			(unsigned long long) data_vio->mapped.pbn,
+			thread_id,
+			expected);
+}
+
+static inline void
+set_data_vio_mapped_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	vdo_set_completion_callback(&data_vio->vio.completion,
+				    callback,
+				    data_vio->mapped.zone->thread_id);
+}
+
+static inline void assert_data_vio_in_new_mapped_zone(struct data_vio *data_vio)
+{
+	thread_id_t expected = data_vio->new_mapped.zone->thread_id;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((expected == thread_id),
+			"data_vio for new_mapped physical block %llu on thread %u, should be on thread %u",
+			(unsigned long long) data_vio->new_mapped.pbn,
+			thread_id,
+			expected);
+}
+
+static inline void
+set_data_vio_new_mapped_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	vdo_set_completion_callback(&data_vio->vio.completion,
+				    callback,
+				    data_vio->new_mapped.zone->thread_id);
+}
+
+static inline void assert_data_vio_in_journal_zone(struct data_vio *data_vio)
+{
+	thread_id_t journal_thread = vdo_from_data_vio(data_vio)->thread_config.journal_thread;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((journal_thread == thread_id),
+			"data_vio for logical block %llu on thread %u, should be on journal thread %u",
+			(unsigned long long) data_vio->logical.lbn,
+			thread_id,
+			journal_thread);
+}
+
+static inline void set_data_vio_journal_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	thread_id_t journal_thread = vdo_from_data_vio(data_vio)->thread_config.journal_thread;
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, journal_thread);
+}
+
+/**
+ * launch_data_vio_journal_callback() - Set a callback as a journal operation and invoke it
+ *					immediately.
+ */
+static inline void
+launch_data_vio_journal_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	set_data_vio_journal_callback(data_vio, callback);
+	vdo_launch_completion(&data_vio->vio.completion);
+}
+
+static inline void assert_data_vio_in_packer_zone(struct data_vio *data_vio)
+{
+	thread_id_t packer_thread = vdo_from_data_vio(data_vio)->thread_config.packer_thread;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((packer_thread == thread_id),
+			"data_vio for logical block %llu on thread %u, should be on packer thread %u",
+			(unsigned long long) data_vio->logical.lbn,
+			thread_id,
+			packer_thread);
+}
+
+static inline void set_data_vio_packer_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	thread_id_t packer_thread = vdo_from_data_vio(data_vio)->thread_config.packer_thread;
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, packer_thread);
+}
+
+/**
+ * launch_data_vio_packer_callback() - Set a callback as a packer operation and invoke it
+ *				       immediately.
+ */
+static inline void launch_data_vio_packer_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	set_data_vio_packer_callback(data_vio, callback);
+	vdo_launch_completion(&data_vio->vio.completion);
+}
+
+static inline void assert_data_vio_on_cpu_thread(struct data_vio *data_vio)
+{
+	thread_id_t cpu_thread = vdo_from_data_vio(data_vio)->thread_config.cpu_thread;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((cpu_thread == thread_id),
+			"data_vio for logical block %llu on thread %u, should be on cpu thread %u",
+			(unsigned long long) data_vio->logical.lbn,
+			thread_id,
+			cpu_thread);
+}
+
+static inline void set_data_vio_cpu_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	thread_id_t cpu_thread = vdo_from_data_vio(data_vio)->thread_config.cpu_thread;
+
+	vdo_set_completion_callback(&data_vio->vio.completion, callback, cpu_thread);
+}
+
+/**
+ * launch_data_vio_cpu_callback() - Set a callback to run on the CPU queues and invoke it
+ *				    immediately.
+ */
+static inline void
+launch_data_vio_cpu_callback(struct data_vio *data_vio,
+			     vdo_action *callback,
+			     enum vdo_completion_priority priority)
+{
+	set_data_vio_cpu_callback(data_vio, callback);
+	vdo_launch_completion_with_priority(&data_vio->vio.completion, priority);
+}
+
+static inline void set_data_vio_bio_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	vdo_set_completion_callback(&data_vio->vio.completion,
+				    callback,
+				    get_vio_bio_zone_thread_id(&data_vio->vio));
+}
+
+/**
+ * launch_data_vio_bio_zone_callback() - Set a callback as a bio zone operation and invoke it
+ *					 immediately.
+ */
+static inline void
+launch_data_vio_bio_zone_callback(struct data_vio *data_vio, vdo_action *callback)
+{
+	set_data_vio_bio_zone_callback(data_vio, callback);
+	vdo_launch_completion_with_priority(&data_vio->vio.completion, BIO_Q_DATA_PRIORITY);
+}
+
+/**
+ * launch_data_vio_on_bio_ack_queue() - If the vdo uses a bio_ack queue, set a callback to run on
+ *					it and invoke it immediately; otherwise, just run the
+ *					callback on the current thread.
+ */
+static inline void
+launch_data_vio_on_bio_ack_queue(struct data_vio *data_vio, vdo_action *callback)
+{
+	struct vdo_completion *completion = &data_vio->vio.completion;
+	struct vdo *vdo = completion->vdo;
+
+	if (!vdo_uses_bio_ack_queue(vdo)) {
+		callback(completion);
+		return;
+	}
+
+	vdo_set_completion_callback(completion, callback, vdo->thread_config.bio_ack_thread);
+	vdo_launch_completion_with_priority(completion, BIO_ACK_Q_ACK_PRIORITY);
+}
+
+void data_vio_allocate_data_block(struct data_vio *data_vio,
+				  enum pbn_lock_type write_lock_type,
+				  vdo_action *callback,
+				  vdo_action *error_handler);
+
+/**
+ * release_data_vio_allocation_lock() - Release the PBN lock on a data_vio's allocated block.
+ * @reset: If true, the allocation will be reset (i.e. any allocated pbn will be forgotten).
+ *
+ * If the reference to the locked block is still provisional, it will be released as well.
+ */
+void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset);
+
+int __must_check uncompress_data_vio(struct data_vio *data_vio,
+				     enum block_mapping_state mapping_state,
+				     char *buffer);
+
+void update_metadata_for_data_vio_write(struct data_vio *data_vio, struct pbn_lock *lock);
+void write_data_vio(struct data_vio *data_vio);
+void launch_compress_data_vio(struct data_vio *data_vio);
+void continue_data_vio_with_block_map_slot(struct vdo_completion *completion);
+
+#endif /* DATA_VIO_H */
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
new file mode 100644
index 00000000000..d71f93caf98
--- /dev/null
+++ b/drivers/md/dm-vdo/dedupe.c
@@ -0,0 +1,3073 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+/**
+ * DOC:
+ *
+ * Hash Locks:
+ *
+ * A hash_lock controls and coordinates writing, index access, and dedupe among groups of data_vios
+ * concurrently writing identical blocks, allowing them to deduplicate not only against advice but
+ * also against each other. This saves on index queries and allows those data_vios to concurrently
+ * deduplicate against a single block instead of being serialized through a PBN read lock. Only one
+ * index query is needed for each hash_lock, instead of one for every data_vio.
+ *
+ * A hash_lock acts like a state machine perhaps more than as a lock. Other than the starting and
+ * ending states INITIALIZING and BYPASSING, every state represents and is held for the duration of
+ * an asynchronous operation. All state transitions are performed on the thread of the hash_zone
+ * containing the lock. An asynchronous operation is almost always performed upon entering a state,
+ * and the callback from that operation triggers exiting the state and entering a new state.
+ *
+ * In all states except DEDUPING, there is a single data_vio, called the lock agent, performing the
+ * asynchronous operations on behalf of the lock. The agent will change during the lifetime of the
+ * lock if the lock is shared by more than one data_vio. data_vios waiting to deduplicate are kept
+ * on a wait queue. Viewed a different way, the agent holds the lock exclusively until the lock
+ * enters the DEDUPING state, at which point it becomes a shared lock that all the waiters (and any
+ * new data_vios that arrive) use to share a PBN lock. In state DEDUPING, there is no agent. When
+ * the last data_vio in the lock calls back in DEDUPING, it becomes the agent and the lock becomes
+ * exclusive again. New data_vios that arrive in the lock will also go on the wait queue.
+ *
+ * The existence of lock waiters is a key factor controlling which state the lock transitions to
+ * next. When the lock is new or has waiters, it will always try to reach DEDUPING, and when it
+ * doesn't, it will try to clean up and exit.
+ *
+ * Deduping requires holding a PBN lock on a block that is known to contain data identical to the
+ * data_vios in the lock, so the lock will send the agent to the duplicate zone to acquire the PBN
+ * lock (LOCKING), to the kernel I/O threads to read and verify the data (VERIFYING), or to write a
+ * new copy of the data to a full data block or a slot in a compressed block (WRITING).
+ *
+ * Cleaning up consists of updating the index when the data location is different from the initial
+ * index query (UPDATING, triggered by stale advice, compression, and rollover), releasing the PBN
+ * lock on the duplicate block (UNLOCKING), and if the agent is the last data_vio referencing the
+ * lock, releasing the hash_lock itself back to the hash zone (BYPASSING).
+ *
+ * The shortest sequence of states is for non-concurrent writes of new data:
+ *   INITIALIZING -> QUERYING -> WRITING -> BYPASSING
+ * This sequence is short because no PBN read lock or index update is needed.
+ *
+ * Non-concurrent, finding valid advice looks like this (endpoints elided):
+ *   -> QUERYING -> LOCKING -> VERIFYING -> DEDUPING -> UNLOCKING ->
+ * Or with stale advice (endpoints elided):
+ *   -> QUERYING -> LOCKING -> VERIFYING -> UNLOCKING -> WRITING -> UPDATING ->
+ *
+ * When there are not enough reference count increments available on a PBN for a data_vio
+ * to deduplicate, a new lock is forked and the excess waiters roll over to the new lock (which
+ * goes directly to WRITING). The new lock takes the place of the old lock in the lock map so new
+ * data_vios will be directed to it. The two locks will proceed independently, but only the new
+ * lock will have the right to update the index (unless it also forks).
+ *
+ * Since rollover happens in a lock instance, once a valid data location has been selected, it will
+ * not change. QUERYING and WRITING are only performed once per lock lifetime. All other
+ * non-endpoint states can be re-entered.
+ *
+ * The function names in this module follow a convention referencing the states and transitions in
+ * the state machine. For example, for the LOCKING state, there are start_locking() and
+ * finish_locking() functions.  start_locking() is invoked by the finish function of the state (or
+ * states) that transition to LOCKING. It performs the actual lock state change and must be invoked
+ * on the hash zone thread.  finish_locking() is called by (or continued via callback from) the
+ * code actually obtaining the lock. It does any bookkeeping or decision-making required and
+ * invokes the appropriate start function of the state being transitioned to after LOCKING.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * Index Queries:
+ *
+ * A query to the UDS index is handled asynchronously by the index's threads. When the query is
+ * complete, a callback supplied with the query will be called from one of those threads. Under
+ * heavy system load, the index may be slower to respond than is desirable for reasonable I/O
+ * throughput. Since deduplication of writes is not necessary for correct operation of a VDO
+ * device, it is acceptable to time out slow index queries and proceed to fulfill a write
+ * request without deduplicating. However, because the uds_request struct itself is supplied by the
+ * caller, we cannot simply reuse a uds_request object which we have chosen to time out. Hence,
+ * each hash_zone maintains a pool of dedupe_contexts which each contain a uds_request along with a
+ * reference to the data_vio on behalf of which they are performing a query.
+ *
+ * When a hash_lock needs to query the index, it attempts to acquire an unused dedupe_context from
+ * its hash_zone's pool. If one is available, that context is prepared, associated with the
+ * hash_lock's agent, added to the list of pending contexts, and then sent to the index. The
+ * context's state will be transitioned from DEDUPE_CONTEXT_IDLE to DEDUPE_CONTEXT_PENDING. If all
+ * goes well, the dedupe callback will be called by the index which will change the context's state
+ * to DEDUPE_CONTEXT_COMPLETE, and the associated data_vio will be enqueued to run back in the hash
+ * zone where the query results will be processed and the context will be put back in the idle
+ * state and returned to the hash_zone's available list.
+ *
+ * The first time an index query is launched from a given hash_zone, a timer is started. When the
+ * timer fires, the hash_zone's completion is enqueued to run in the hash_zone where the zone's
+ * pending list will be searched for any contexts in the pending state which have been running for
+ * too long. Those contexts are transitioned to the DEDUPE_CONTEXT_TIMED_OUT state and moved to the
+ * zone's timed_out list, where they won't be examined again if there is a subsequent time out. The
+ * data_vios associated with timed out contexts are sent to continue processing their write
+ * operation without deduplicating. The timer is also restarted.
+ *
+ * When the dedupe callback is run for a context which is in the timed out state, that context is
+ * moved to the DEDUPE_CONTEXT_TIMED_OUT_COMPLETE state. No other action need be taken as the
+ * associated data_vios have already been dispatched.
+ *
+ * If a hash_lock needs a dedupe context, and the available list is empty, the timed_out list will
+ * be searched for any contexts which are timed out and complete. One of these will be used
+ * immediately, and the rest will be returned to the available list and marked idle.
+ */
+
+#include "dedupe.h"
+
+#include <linux/atomic.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/ratelimit.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "uds.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "io-submitter.h"
+#include "packer.h"
+#include "physical-zone.h"
+#include "pointer-map.h"
+#include "slab-depot.h"
+#include "statistics.h"
+#include "types.h"
+#include "vdo.h"
+#include "wait-queue.h"
+
+struct uds_attribute {
+	struct attribute attr;
+	const char *(*show_string)(struct hash_zones *hash_zones);
+};
+
+enum timer_state {
+	DEDUPE_QUERY_TIMER_IDLE,
+	DEDUPE_QUERY_TIMER_RUNNING,
+	DEDUPE_QUERY_TIMER_FIRED,
+};
+
+enum dedupe_context_state {
+	DEDUPE_CONTEXT_IDLE,
+	DEDUPE_CONTEXT_PENDING,
+	DEDUPE_CONTEXT_TIMED_OUT,
+	DEDUPE_CONTEXT_COMPLETE,
+	DEDUPE_CONTEXT_TIMED_OUT_COMPLETE,
+};
+
+/* Possible index states: closed, opened, or transitioning between those two. */
+enum index_state {
+	IS_CLOSED,
+	IS_CHANGING,
+	IS_OPENED,
+};
+
+static const char *CLOSED = "closed";
+static const char *CLOSING = "closing";
+static const char *ERROR = "error";
+static const char *OFFLINE = "offline";
+static const char *ONLINE = "online";
+static const char *OPENING = "opening";
+static const char *SUSPENDED = "suspended";
+static const char *UNKNOWN = "unknown";
+
+/* Version 2 uses the kernel space UDS index and is limited to 16 bytes */
+enum {
+	UDS_ADVICE_VERSION = 2,
+	/* version byte + state byte + 64-bit little-endian PBN */
+	UDS_ADVICE_SIZE = 1 + 1 + sizeof(u64),
+};
+
+enum hash_lock_state {
+	/* State for locks that are not in use or are being initialized. */
+	VDO_HASH_LOCK_INITIALIZING,
+
+	/* This is the sequence of states typically used on the non-dedupe path. */
+	VDO_HASH_LOCK_QUERYING,
+	VDO_HASH_LOCK_WRITING,
+	VDO_HASH_LOCK_UPDATING,
+
+	/* The remaining states are typically used on the dedupe path in this order. */
+	VDO_HASH_LOCK_LOCKING,
+	VDO_HASH_LOCK_VERIFYING,
+	VDO_HASH_LOCK_DEDUPING,
+	VDO_HASH_LOCK_UNLOCKING,
+
+	/*
+	 * Terminal state for locks returning to the pool. Must be last both because it's the final
+	 * state, and also because it's used to count the states.
+	 */
+	VDO_HASH_LOCK_BYPASSING,
+};
+
+static const char * const LOCK_STATE_NAMES[] = {
+	[VDO_HASH_LOCK_BYPASSING] = "BYPASSING",
+	[VDO_HASH_LOCK_DEDUPING] = "DEDUPING",
+	[VDO_HASH_LOCK_INITIALIZING] = "INITIALIZING",
+	[VDO_HASH_LOCK_LOCKING] = "LOCKING",
+	[VDO_HASH_LOCK_QUERYING] = "QUERYING",
+	[VDO_HASH_LOCK_UNLOCKING] = "UNLOCKING",
+	[VDO_HASH_LOCK_UPDATING] = "UPDATING",
+	[VDO_HASH_LOCK_VERIFYING] = "VERIFYING",
+	[VDO_HASH_LOCK_WRITING] = "WRITING",
+};
+
+struct hash_lock {
+	/* The block hash covered by this lock */
+	struct uds_record_name hash;
+
+	/* When the lock is unused, this list entry allows the lock to be pooled */
+	struct list_head pool_node;
+
+	/*
+	 * A list containing the data VIOs sharing this lock, all having the same record name and
+	 * data block contents, linked by their hash_lock_entry fields.
+	 */
+	struct list_head duplicate_ring;
+
+	/* The number of data_vios sharing this lock instance */
+	data_vio_count_t reference_count;
+
+	/* The maximum value of reference_count in the lifetime of this lock */
+	data_vio_count_t max_references;
+
+	/* The current state of this lock */
+	enum hash_lock_state state;
+
+	/* True if the UDS index should be updated with new advice */
+	bool update_advice;
+
+	/* True if the advice has been verified to be a true duplicate */
+	bool verified;
+
+	/* True if the lock has already accounted for an initial verification */
+	bool verify_counted;
+
+	/* True if this lock is registered in the lock map (cleared on rollover) */
+	bool registered;
+
+	/*
+	 * If verified is false, this is the location of a possible duplicate. If verified is true,
+	 * it is the verified location of a true duplicate.
+	 */
+	struct zoned_pbn duplicate;
+
+	/* The PBN lock on the block containing the duplicate data */
+	struct pbn_lock *duplicate_lock;
+
+	/* The data_vio designated to act on behalf of the lock */
+	struct data_vio *agent;
+
+	/*
+	 * Other data_vios with data identical to the agent's which are currently waiting for the
+	 * agent to get the information they all need to deduplicate--either against each other, or
+	 * against an existing duplicate on disk.
+	 */
+	struct wait_queue waiters;
+};
+
+enum {
+	LOCK_POOL_CAPACITY = MAXIMUM_VDO_USER_VIOS,
+};
+
+struct hash_zones {
+	struct action_manager *manager;
+	struct kobject dedupe_directory;
+	struct uds_parameters parameters;
+	struct uds_index_session *index_session;
+	struct ratelimit_state ratelimiter;
+	atomic64_t timeouts;
+	atomic64_t dedupe_context_busy;
+
+	/* This spinlock protects the state fields and the starting of dedupe requests. */
+	spinlock_t lock;
+
+	/* The fields in the next block are all protected by the lock */
+	struct vdo_completion completion;
+	enum index_state index_state;
+	enum index_state index_target;
+	struct admin_state state;
+	bool changing;
+	bool create_flag;
+	bool dedupe_flag;
+	bool error_flag;
+	u64 reported_timeouts;
+
+	/* The number of zones */
+	zone_count_t zone_count;
+	/* The hash zones themselves */
+	struct hash_zone zones[];
+};
+
+/* These are in milliseconds. */
+unsigned int vdo_dedupe_index_timeout_interval = 5000;
+unsigned int vdo_dedupe_index_min_timer_interval = 100;
+/* Same two variables, in jiffies for easier consumption. */
+static u64 vdo_dedupe_index_timeout_jiffies;
+static u64 vdo_dedupe_index_min_timer_jiffies;
+
+static inline struct hash_zone *as_hash_zone(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_HASH_ZONE_COMPLETION);
+	return container_of(completion, struct hash_zone, completion);
+}
+
+static inline struct hash_zones *as_hash_zones(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_HASH_ZONES_COMPLETION);
+	return container_of(completion, struct hash_zones, completion);
+}
+
+static inline void assert_in_hash_zone(struct hash_zone *zone, const char *name)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
+			"%s called on hash zone thread",
+			name);
+}
+
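+/* Atomically swap a dedupe context's state from @old to @new; returns true if the swap occurred. */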
+static inline bool change_context_state(struct dedupe_context *context, int old, int new)
+{
+	return (atomic_cmpxchg(&context->state, old, new) == old);
+}
+
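+/* Atomically swap a zone's timer state from @old to @new; returns true if the swap occurred. */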
+static inline bool change_timer_state(struct hash_zone *zone, int old, int new)
+{
+	return (atomic_cmpxchg(&zone->timer_state, old, new) == old);
+}
+
+/**
+ * return_hash_lock_to_pool() - (Re)initialize a hash lock and return it to its pool.
+ * @zone: The zone from which the lock was borrowed.
+ * @lock: The lock that is no longer in use.
+ */
+static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *lock)
+{
+	memset(lock, 0, sizeof(*lock));
+	INIT_LIST_HEAD(&lock->pool_node);
+	INIT_LIST_HEAD(&lock->duplicate_ring);
+	vdo_initialize_wait_queue(&lock->waiters);
+	list_add_tail(&lock->pool_node, &zone->lock_pool);
+}
+
+/**
+ * vdo_get_duplicate_lock() - Get the PBN lock on the duplicate data location for a data_vio from
+ *                            the hash_lock the data_vio holds (if there is one).
+ * @data_vio: The data_vio to query.
+ *
+ * Return: The PBN lock on the data_vio's duplicate location.
+ */
+struct pbn_lock *vdo_get_duplicate_lock(struct data_vio *data_vio)
+{
+	if (data_vio->hash_lock == NULL)
+		return NULL;
+	return data_vio->hash_lock->duplicate_lock;
+}
+
+/**
+ * get_hash_lock_state_name() - Get the string representation of a hash lock state.
+ * @state: The hash lock state.
+ *
+ * Return: The short string representing the state
+ */
+static const char *get_hash_lock_state_name(enum hash_lock_state state)
+{
+	/* Catch if a state has been added without updating the name array. */
+	STATIC_ASSERT((VDO_HASH_LOCK_BYPASSING + 1) == ARRAY_SIZE(LOCK_STATE_NAMES));
+	return (state < ARRAY_SIZE(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] : "INVALID";
+}
+
+/**
+ * assert_hash_lock_agent() - Assert that a data_vio is the agent of its hash lock, and that this
+ *                            is being called in the hash zone.
+ * @data_vio: The data_vio expected to be the lock agent.
+ * @where: A string describing the function making the assertion.
+ */
+static void assert_hash_lock_agent(struct data_vio *data_vio, const char *where)
+{
+	/* Not safe to access the agent field except from the hash zone. */
+	assert_data_vio_in_hash_zone(data_vio);
+	ASSERT_LOG_ONLY(data_vio == data_vio->hash_lock->agent,
+			"%s must be for the hash lock agent", where);
+}
+
+/**
+ * set_duplicate_lock() - Set the duplicate lock held by a hash lock. May only be called in the
+ *                        physical zone of the PBN lock.
+ * @hash_lock: The hash lock to update.
+ * @pbn_lock: The PBN read lock to use as the duplicate lock.
+ */
+static void set_duplicate_lock(struct hash_lock *hash_lock, struct pbn_lock *pbn_lock)
+{
+	ASSERT_LOG_ONLY((hash_lock->duplicate_lock == NULL),
+			"hash lock must not already hold a duplicate lock");
+
+	pbn_lock->holder_count += 1;
+	hash_lock->duplicate_lock = pbn_lock;
+}
+
+/**
+ * dequeue_lock_waiter() - Remove the first data_vio from the lock's wait queue and return it.
+ * @lock: The lock containing the wait queue.
+ *
+ * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty.
+ */
+static inline struct data_vio *dequeue_lock_waiter(struct hash_lock *lock)
+{
+	return waiter_as_data_vio(vdo_dequeue_next_waiter(&lock->waiters));
+}
+
+/**
+ * set_hash_lock() - Set, change, or clear the hash lock a data_vio is using.
+ * @data_vio: The data_vio to update.
+ * @new_lock: The hash lock the data_vio is joining.
+ *
+ * Updates the hash lock (or locks) to reflect the change in membership.
+ */
+static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
+{
+	struct hash_lock *old_lock = data_vio->hash_lock;
+
+	if (old_lock != NULL) {
+		ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
+				"must have a hash zone when holding a hash lock");
+		ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
+				"must be on a hash lock ring when holding a hash lock");
+		ASSERT_LOG_ONLY(old_lock->reference_count > 0,
+				"hash lock reference must be counted");
+
+		if ((old_lock->state != VDO_HASH_LOCK_BYPASSING) &&
+		    (old_lock->state != VDO_HASH_LOCK_UNLOCKING))
+			/*
+			 * If the reference count goes to zero in a non-terminal state, we're most
+			 * likely leaking this lock.
+			 */
+			ASSERT_LOG_ONLY(old_lock->reference_count > 1,
+					"hash locks should only become unreferenced in a terminal state, not state %s",
+					get_hash_lock_state_name(old_lock->state));
+
+		list_del_init(&data_vio->hash_lock_entry);
+		old_lock->reference_count -= 1;
+
+		data_vio->hash_lock = NULL;
+	}
+
+	if (new_lock != NULL) {
+		/*
+		 * Keep all data_vios sharing the lock on a ring since they can complete in any
+		 * order and we'll always need a pointer to one to compare data.
+		 */
+		list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring);
+		new_lock->reference_count += 1;
+		if (new_lock->max_references < new_lock->reference_count)
+			new_lock->max_references = new_lock->reference_count;
+
+		data_vio->hash_lock = new_lock;
+	}
+}
+
+/* There are loops in the state diagram, so some forward decl's are needed. */
+static void start_deduping(struct hash_lock *lock, struct data_vio *agent, bool agent_is_done);
+static void start_locking(struct hash_lock *lock, struct data_vio *agent);
+static void start_writing(struct hash_lock *lock, struct data_vio *agent);
+static void unlock_duplicate_pbn(struct vdo_completion *completion);
+static void transfer_allocation_lock(struct data_vio *data_vio);
+
+/**
+ * exit_hash_lock() - Bottleneck for data_vios that have written or deduplicated and are no
+ *                    longer needed as the agent for the hash lock.
+ * @data_vio: The data_vio to complete and send to be cleaned up.
+ */
+static void exit_hash_lock(struct data_vio *data_vio)
+{
+	/* Release the hash lock now, saving a thread transition in cleanup. */
+	vdo_release_hash_lock(data_vio);
+
+	/* Complete the data_vio and start the clean-up path to release any locks it still holds. */
+	data_vio->vio.completion.callback = complete_data_vio;
+
+	continue_data_vio(data_vio);
+}
+
+/**
+ * set_duplicate_location() - Set the location of the duplicate block for data_vio, updating the
+ *                            is_duplicate and duplicate fields from a zoned_pbn.
+ * @data_vio: The data_vio to modify.
+ * @source: The location of the duplicate.
+ */
+static void set_duplicate_location(struct data_vio *data_vio, const struct zoned_pbn source)
+{
+	data_vio->is_duplicate = (source.pbn != VDO_ZERO_BLOCK);
+	data_vio->duplicate = source;
+}
+
+/**
+ * retire_lock_agent() - Retire the active lock agent, replacing it with the first lock waiter, and
+ *                       make the retired agent exit the hash lock.
+ * @lock: The hash lock to update.
+ *
+ * Return: The new lock agent (which will be NULL if there was no waiter)
+ */
+static struct data_vio *retire_lock_agent(struct hash_lock *lock)
+{
+	struct data_vio *old_agent = lock->agent;
+	struct data_vio *new_agent = dequeue_lock_waiter(lock);
+
+	lock->agent = new_agent;
+	exit_hash_lock(old_agent);
+	if (new_agent != NULL)
+		set_duplicate_location(new_agent, lock->duplicate);
+	return new_agent;
+}
+
+/**
+ * wait_on_hash_lock() - Add a data_vio to the lock's queue of waiters.
+ * @lock: The hash lock on which to wait.
+ * @data_vio: The data_vio to add to the queue.
+ */
+static void wait_on_hash_lock(struct hash_lock *lock, struct data_vio *data_vio)
+{
+	vdo_enqueue_waiter(&lock->waiters, &data_vio->waiter);
+
+	/*
+	 * Make sure the agent doesn't block indefinitely in the packer since it now has at least
+	 * one other data_vio waiting on it.
+	 */
+	if ((lock->state != VDO_HASH_LOCK_WRITING) || !cancel_data_vio_compression(lock->agent))
+		return;
+
+	/*
+	 * Even though we're waiting, we also have to send ourselves as a one-way message to the
+	 * packer to ensure the agent continues executing. This is safe because
+	 * cancel_data_vio_compression() guarantees the agent won't continue executing until this
+	 * message arrives in the packer, and because the wait queue link isn't used for sending
+	 * the message.
+	 */
+	data_vio->compression.lock_holder = lock->agent;
+	launch_data_vio_packer_callback(data_vio, vdo_remove_lock_holder_from_packer);
+}
+
+/**
+ * abort_waiter() - waiter_callback function that shunts waiters to write their blocks without
+ *                  optimization.
+ * @waiter: The data_vio's waiter link.
+ * @context: Not used.
+ */
+static void abort_waiter(struct waiter *waiter, void *context __always_unused)
+{
+	write_data_vio(waiter_as_data_vio(waiter));
+}
+
+/**
+ * start_bypassing() - Stop using the hash lock.
+ * @lock: The hash lock.
+ * @agent: The data_vio acting as the agent for the lock.
+ *
+ * Stops using the hash lock. This is the final transition for hash locks which did not get an
+ * error.
+ */
+static void start_bypassing(struct hash_lock *lock, struct data_vio *agent)
+{
+	lock->state = VDO_HASH_LOCK_BYPASSING;
+	exit_hash_lock(agent);
+}
+
+void vdo_clean_failed_hash_lock(struct data_vio *data_vio)
+{
+	struct hash_lock *lock = data_vio->hash_lock;
+
+	if (lock->state == VDO_HASH_LOCK_BYPASSING) {
+		exit_hash_lock(data_vio);
+		return;
+	}
+
+	if (lock->agent == NULL) {
+		lock->agent = data_vio;
+	} else if (data_vio != lock->agent) {
+		exit_hash_lock(data_vio);
+		return;
+	}
+
+	lock->state = VDO_HASH_LOCK_BYPASSING;
+
+	/* Ensure we don't attempt to update advice when cleaning up. */
+	lock->update_advice = false;
+
+	vdo_notify_all_waiters(&lock->waiters, abort_waiter, NULL);
+
+	if (lock->duplicate_lock != NULL) {
+		/* The agent must reference the duplicate zone to launch it. */
+		data_vio->duplicate = lock->duplicate;
+		launch_data_vio_duplicate_zone_callback(data_vio, unlock_duplicate_pbn);
+		return;
+	}
+
+	lock->agent = NULL;
+	data_vio->is_duplicate = false;
+	exit_hash_lock(data_vio);
+}
+
+/**
+ * finish_unlocking() - Handle the result of the agent for the lock releasing a read lock on
+ *                      the duplicate candidate.
+ * @completion: The completion of the data_vio acting as the lock's agent.
+ *
+ * This continuation is registered in unlock_duplicate_pbn().
+ */
+static void finish_unlocking(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct hash_lock *lock = agent->hash_lock;
+
+	assert_hash_lock_agent(agent, __func__);
+
+	ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+			"must have released the duplicate lock for the hash lock");
+
+	if (!lock->verified) {
+		/*
+		 * UNLOCKING -> WRITING transition: The lock we released was on an unverified
+		 * block, so it must have been a lock on advice we were verifying, not on a
+		 * location that was used for deduplication. Go write (or compress) the block to
+		 * get a location to dedupe against.
+		 */
+		start_writing(lock, agent);
+		return;
+	}
+
+	/*
+	 * With the lock released, the verified duplicate block may already have changed and will
+	 * need to be re-verified if a waiter arrived.
+	 */
+	lock->verified = false;
+
+	if (vdo_has_waiters(&lock->waiters)) {
+		/*
+		 * UNLOCKING -> LOCKING transition: A new data_vio entered the hash lock while the
+		 * agent was releasing the PBN lock. The current agent exits and the waiter has to
+		 * re-lock and re-verify the duplicate location.
+		 *
+		 * TODO: If we used the current agent to re-acquire the PBN lock we wouldn't need
+		 * to re-verify.
+		 */
+		agent = retire_lock_agent(lock);
+		start_locking(lock, agent);
+		return;
+	}
+
+	/*
+	 * UNLOCKING -> BYPASSING transition: The agent is done with the lock and no other
+	 * data_vios reference it, so remove it from the lock map and return it to the pool.
+	 */
+	start_bypassing(lock, agent);
+}
+
+/**
+ * unlock_duplicate_pbn() - Release a read lock on the PBN of the block that may or may not have
+ *                          contained duplicate data.
+ * @completion: The completion of the data_vio acting as the lock's agent.
+ *
+ * This continuation is launched by start_unlocking(), and calls back to finish_unlocking() on the
+ * hash zone thread.
+ */
+static void unlock_duplicate_pbn(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct hash_lock *lock = agent->hash_lock;
+
+	assert_data_vio_in_duplicate_zone(agent);
+	ASSERT_LOG_ONLY(lock->duplicate_lock != NULL, "must have a duplicate lock to release");
+
+	vdo_release_physical_zone_pbn_lock(agent->duplicate.zone,
+					   agent->duplicate.pbn,
+					   UDS_FORGET(lock->duplicate_lock));
+	if (lock->state == VDO_HASH_LOCK_BYPASSING) {
+		complete_data_vio(completion);
+		return;
+	}
+
+	launch_data_vio_hash_zone_callback(agent, finish_unlocking);
+}
+
+/**
+ * start_unlocking() - Release a read lock on the PBN of the block that may or may not have
+ *                     contained duplicate data.
+ * @lock: The hash lock.
+ * @agent: The data_vio currently acting as the agent for the lock.
+ */
+static void start_unlocking(struct hash_lock *lock, struct data_vio *agent)
+{
+	lock->state = VDO_HASH_LOCK_UNLOCKING;
+	launch_data_vio_duplicate_zone_callback(agent, unlock_duplicate_pbn);
+}
+
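+/* Return a dedupe context to its zone's available list and decrement the zone's active count. */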
+static void release_context(struct dedupe_context *context)
+{
+	struct hash_zone *zone = context->zone;
+
+	WRITE_ONCE(zone->active, zone->active - 1);
+	list_move(&context->list_entry, &zone->available);
+}
+
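+/* If the agent's dedupe context has completed, mark it idle and release it back to its zone. */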
+static void process_update_result(struct data_vio *agent)
+{
+	struct dedupe_context *context = agent->dedupe_context;
+
+	if ((context == NULL) ||
+	    !change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE))
+		return;
+
+	release_context(context);
+}
+
+/**
+ * finish_updating() - Process the result of a UDS update performed by the agent for the lock.
+ * @completion: The completion of the data_vio that performed the update.
+ *
+ * This continuation is registered in start_updating().
+ */
+static void finish_updating(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct hash_lock *lock = agent->hash_lock;
+
+	assert_hash_lock_agent(agent, __func__);
+
+	process_update_result(agent);
+
+	/*
+	 * UDS was updated successfully, so don't update again unless the duplicate location
+	 * changes due to rollover.
+	 */
+	lock->update_advice = false;
+
+	if (vdo_has_waiters(&lock->waiters)) {
+		/*
+		 * UPDATING -> DEDUPING transition: A new data_vio arrived during the UDS update.
+		 * Send it on the verified dedupe path. The agent is done with the lock, but the
+		 * lock may still need to use it to clean up after rollover.
+		 */
+		start_deduping(lock, agent, true);
+		return;
+	}
+
+	if (lock->duplicate_lock != NULL) {
+		/*
+		 * UPDATING -> UNLOCKING transition: No one is waiting to dedupe, but we hold a
+		 * duplicate PBN lock, so go release it.
+		 */
+		start_unlocking(lock, agent);
+		return;
+	}
+
+	/*
+	 * UPDATING -> BYPASSING transition: No one is waiting to dedupe and there's no lock to
+	 * release.
+	 */
+	start_bypassing(lock, agent);
+}
+
+static void query_index(struct data_vio *data_vio, enum uds_request_type operation);
+
+/**
+ * start_updating() - Continue deduplication with the last step, updating UDS with the location of
+ *                    the duplicate that should be returned as advice in the future.
+ * @lock: The hash lock.
+ * @agent: The data_vio currently acting as the agent for the lock.
+ */
+static void start_updating(struct hash_lock *lock, struct data_vio *agent)
+{
+	lock->state = VDO_HASH_LOCK_UPDATING;
+
+	ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified");
+	ASSERT_LOG_ONLY(lock->update_advice, "should only update advice if needed");
+
+	agent->last_async_operation = VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX;
+	set_data_vio_hash_zone_callback(agent, finish_updating);
+	query_index(agent, UDS_UPDATE);
+}
+
+/**
+ * finish_deduping() - Handle a data_vio that has finished deduplicating against the block locked
+ *                     by the hash lock.
+ * @lock: The hash lock.
+ * @data_vio: The lock holder that has finished deduplicating.
+ *
+ * If there are other data_vios still sharing the lock, this will just release the data_vio's share
+ * of the lock and finish processing the data_vio. If this is the last data_vio holding the lock,
+ * this makes the data_vio the lock agent and uses it to advance the state of the lock so it can
+ * eventually be released.
+ */
+static void finish_deduping(struct hash_lock *lock, struct data_vio *data_vio)
+{
+	struct data_vio *agent = data_vio;
+
+	ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING");
+	ASSERT_LOG_ONLY(!vdo_has_waiters(&lock->waiters),
+			"shouldn't have any lock waiters in DEDUPING");
+
+	/* Just release the lock reference if other data_vios are still deduping. */
+	if (lock->reference_count > 1) {
+		exit_hash_lock(data_vio);
+		return;
+	}
+
+	/* The hash lock must have an agent for all other lock states. */
+	lock->agent = agent;
+	if (lock->update_advice)
+		/*
+		 * DEDUPING -> UPDATING transition: The location of the duplicate block changed
+		 * since the initial UDS query because of compression, rollover, or because the
+		 * query agent didn't have an allocation. The UDS update was delayed in case there
+		 * was another change in location, but with only this data_vio using the hash lock,
+		 * it's time to update the advice.
+		 */
+		start_updating(lock, agent);
+	else
+		/*
+		 * DEDUPING -> UNLOCKING transition: Release the PBN read lock on the duplicate
+		 * location so the hash lock itself can be released (contingent on no new data_vios
+		 * arriving in the lock before the agent returns).
+		 */
+		start_unlocking(lock, agent);
+}
+
+/**
+ * acquire_lock() - Get the lock for a record name.
+ * @zone: The zone responsible for the hash.
+ * @hash: The hash to lock.
+ * @replace_lock: If non-NULL, the lock already registered for the hash which should be replaced by
+ *                the new lock.
+ * @lock_ptr: A pointer to receive the hash lock.
+ *
+ * Gets the lock for the hash (record name) of the data in a data_vio, or if one does not exist (or
+ * if we are explicitly rolling over), initializes a new lock for the hash and registers it in the
+ * zone. This must only be called in the correct thread for the zone.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check acquire_lock(struct hash_zone *zone,
+				     const struct uds_record_name *hash,
+				     struct hash_lock *replace_lock,
+				     struct hash_lock **lock_ptr)
+{
+	struct hash_lock *lock, *new_lock;
+	int result;
+
+	/*
+	 * Borrow and prepare a lock from the pool so we don't have to do two pointer_map accesses
+	 * in the common case of no lock contention.
+	 */
+	result = ASSERT(!list_empty(&zone->lock_pool), "never need to wait for a free hash lock");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	new_lock = list_entry(zone->lock_pool.prev, struct hash_lock, pool_node);
+	list_del_init(&new_lock->pool_node);
+
+	/*
+	 * Fill in the hash of the new lock so we can map it, since we have to use the hash as the
+	 * map key.
+	 */
+	new_lock->hash = *hash;
+
+	result = vdo_pointer_map_put(zone->hash_lock_map,
+				     &new_lock->hash,
+				     new_lock,
+				     (replace_lock != NULL),
+				     (void **) &lock);
+	if (result != VDO_SUCCESS) {
+		return_hash_lock_to_pool(zone, UDS_FORGET(new_lock));
+		return result;
+	}
+
+	if (replace_lock != NULL) {
+		/* On mismatch put the old lock back and return a severe error */
+		ASSERT_LOG_ONLY(lock == replace_lock, "old lock must have been in the lock map");
+		/* TODO: Check earlier and bail out? */
+		ASSERT_LOG_ONLY(replace_lock->registered,
+				"old lock must have been marked registered");
+		replace_lock->registered = false;
+	}
+
+	if (lock == replace_lock) {
+		lock = new_lock;
+		lock->registered = true;
+	} else {
+		/* There's already a lock for the hash, so we don't need the borrowed lock. */
+		return_hash_lock_to_pool(zone, UDS_FORGET(new_lock));
+	}
+
+	*lock_ptr = lock;
+	return VDO_SUCCESS;
+}
+
+/**
+ * enter_forked_lock() - Bind the data_vio to a new hash lock.
+ *
+ * Implements waiter_callback. Binds the data_vio that was waiting to a new hash lock and waits on
+ * that lock.
+ */
+static void enter_forked_lock(struct waiter *waiter, void *context)
+{
+	struct data_vio *data_vio = waiter_as_data_vio(waiter);
+	struct hash_lock *new_lock = (struct hash_lock *) context;
+
+	set_hash_lock(data_vio, new_lock);
+	wait_on_hash_lock(new_lock, data_vio);
+}
+
+/**
+ * fork_hash_lock() - Fork a hash lock because it has run out of increments on the duplicate PBN.
+ * @old_lock: The hash lock to fork.
+ * @new_agent: The data_vio that will be the agent for the new lock.
+ *
+ * Transfers the new agent and any lock waiters to a new hash lock instance which takes the place
+ * of the old lock in the lock map. The old lock remains active, but will not update advice.
+ */
+static void fork_hash_lock(struct hash_lock *old_lock, struct data_vio *new_agent)
+{
+	struct hash_lock *new_lock;
+	int result;
+
+	result = acquire_lock(new_agent->hash_zone, &new_agent->record_name, old_lock, &new_lock);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(new_agent, result);
+		return;
+	}
+
+	/*
+	 * Only one of the two locks should update UDS. The old lock is out of references, so it
+	 * would be poor dedupe advice in the short term.
+	 */
+	old_lock->update_advice = false;
+	new_lock->update_advice = true;
+
+	set_hash_lock(new_agent, new_lock);
+	new_lock->agent = new_agent;
+
+	vdo_notify_all_waiters(&old_lock->waiters, enter_forked_lock, new_lock);
+
+	new_agent->is_duplicate = false;
+	start_writing(new_lock, new_agent);
+}
+
+/**
+ * launch_dedupe() - Reserve a reference count increment for a data_vio and launch it on the dedupe
+ *                   path.
+ * @lock: The hash lock.
+ * @data_vio: The data_vio to deduplicate using the hash lock.
+ * @has_claim: true if the data_vio already has claimed an increment from the duplicate lock.
+ *
+ * If no increments are available, this will roll over to a new hash lock and launch the data_vio
+ * as the writing agent for that lock.
+ */
+static void launch_dedupe(struct hash_lock *lock, struct data_vio *data_vio, bool has_claim)
+{
+	if (!has_claim && !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) {
+		/* Out of increments, so must roll over to a new lock. */
+		fork_hash_lock(lock, data_vio);
+		return;
+	}
+
+	/* Deduplicate against the lock's verified location. */
+	set_duplicate_location(data_vio, lock->duplicate);
+	data_vio->new_mapped = data_vio->duplicate;
+	update_metadata_for_data_vio_write(data_vio, lock->duplicate_lock);
+}
+
+/**
+ * start_deduping() - Enter the hash lock state where data_vios deduplicate in parallel against a
+ *                    true copy of their data on disk.
+ * @lock: The hash lock.
+ * @agent: The data_vio acting as the agent for the lock.
+ * @agent_is_done: true only if the agent has already written or deduplicated against its data.
+ *
+ * If the agent itself needs to deduplicate, an increment for it must already have been claimed
+ * from the duplicate lock, ensuring the hash lock will still have a data_vio holding it.
+ */
+static void start_deduping(struct hash_lock *lock, struct data_vio *agent, bool agent_is_done)
+{
+	lock->state = VDO_HASH_LOCK_DEDUPING;
+
+	/*
+	 * We don't take the downgraded allocation lock from the agent unless we actually need to
+	 * deduplicate against it.
+	 */
+	if (lock->duplicate_lock == NULL) {
+		ASSERT_LOG_ONLY(!vdo_is_state_compressed(agent->new_mapped.state),
+				"compression must have shared a lock");
+		ASSERT_LOG_ONLY(agent_is_done, "agent must have written the new duplicate");
+		transfer_allocation_lock(agent);
+	}
+
+	ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(lock->duplicate_lock),
+			"duplicate_lock must be a PBN read lock");
+
+	/*
+	 * This state is not like any of the other states. There is no designated agent--the agent
+	 * transitioning to this state and all the waiters will be launched to deduplicate in
+	 * parallel.
+	 */
+	lock->agent = NULL;
+
+	/*
+	 * Launch the agent (if not already deduplicated) and as many lock waiters as we have
+	 * available increments for on the dedupe path. If we run out of increments, rollover will
+	 * be triggered and the remaining waiters will be transferred to the new lock.
+	 */
+	if (!agent_is_done) {
+		launch_dedupe(lock, agent, true);
+		agent = NULL;
+	}
+	while (vdo_has_waiters(&lock->waiters))
+		launch_dedupe(lock, dequeue_lock_waiter(lock), false);
+
+	if (agent_is_done)
+		/*
+		 * In the degenerate case where all the waiters rolled over to a new lock, this
+		 * will continue to use the old agent to clean up this lock, and otherwise it just
+		 * lets the agent exit the lock.
+		 */
+		finish_deduping(lock, agent);
+}
+
+/**
+ * increment_stat() - Increment a statistic counter in a non-atomic yet thread-safe manner.
+ * @stat: The statistic field to increment.
+ */
+static void increment_stat(u64 *stat)
+{
+	/*
+	 * Must only be mutated on the hash zone thread. Prevents any compiler shenanigans from
+	 * affecting other threads reading stats.
+	 */
+	WRITE_ONCE(*stat, *stat + 1);
+}
+
+/**
+ * finish_verifying() - Handle the result of the agent for the lock comparing its data to the
+ *                      duplicate candidate.
+ * @completion: The completion of the data_vio used to verify dedupe
+ *
+ * This continuation is registered in start_verifying().
+ */
+static void finish_verifying(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct hash_lock *lock = agent->hash_lock;
+
+	assert_hash_lock_agent(agent, __func__);
+
+	lock->verified = agent->is_duplicate;
+
+	/*
+	 * Only count the result of the initial verification of the advice as valid or stale, and
+	 * not any re-verifications due to PBN lock releases.
+	 */
+	if (!lock->verify_counted) {
+		lock->verify_counted = true;
+		if (lock->verified)
+			increment_stat(&agent->hash_zone->statistics.dedupe_advice_valid);
+		else
+			increment_stat(&agent->hash_zone->statistics.dedupe_advice_stale);
+	}
+
+	/*
+	 * Even if the block is a verified duplicate, we can't start to deduplicate unless we can
+	 * claim a reference count increment for the agent.
+	 */
+	if (lock->verified && !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) {
+		agent->is_duplicate = false;
+		lock->verified = false;
+	}
+
+	if (lock->verified) {
+		/*
+		 * VERIFYING -> DEDUPING transition: The advice is for a true duplicate, so start
+		 * deduplicating against it, if references are available.
+		 */
+		start_deduping(lock, agent, false);
+	} else {
+		/*
+		 * VERIFYING -> UNLOCKING transition: Either the verify failed or we'd try to
+		 * dedupe and roll over immediately, which would fail because it would leave the
+		 * lock without an agent to release the PBN lock. In both cases, the data will have
+		 * to be written or compressed, but first the advice PBN must be unlocked by the
+		 * VERIFYING agent.
+		 */
+		lock->update_advice = true;
+		start_unlocking(lock, agent);
+	}
+}
+
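+/* Compare two data blocks, one machine word at a time. */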
+static bool blocks_equal(char *block1, char *block2)
+{
+	int i;
+
+	for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64))
+		if (*((u64 *) &block1[i]) != *((u64 *) &block2[i]))
+			return false;
+
+	return true;
+}
+
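+/* Compare the agent's data to the candidate duplicate read into its scratch block. */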
+static void verify_callback(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+
+	agent->is_duplicate = blocks_equal(agent->vio.data, agent->scratch_block);
+	launch_data_vio_hash_zone_callback(agent, finish_verifying);
+}
+
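+/*
+ * Decompress the candidate duplicate into the agent's scratch block and compare it to the
+ * agent's data. If decompression fails, the candidate is treated as not a duplicate.
+ */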
+static void uncompress_and_verify(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	int result;
+
+	result = uncompress_data_vio(agent, agent->duplicate.state, agent->scratch_block);
+	if (result == VDO_SUCCESS) {
+		verify_callback(completion);
+		return;
+	}
+
+	agent->is_duplicate = false;
+	launch_data_vio_hash_zone_callback(agent, finish_verifying);
+}
+
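+/* Handle completion of the read of a candidate duplicate block and dispatch the comparison. */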
+static void verify_endio(struct bio *bio)
+{
+	struct data_vio *agent = vio_as_data_vio(bio->bi_private);
+	int result = blk_status_to_errno(bio->bi_status);
+
+	vdo_count_completed_bios(bio);
+	if (result != VDO_SUCCESS) {
+		agent->is_duplicate = false;
+		launch_data_vio_hash_zone_callback(agent, finish_verifying);
+		return;
+	}
+
+	if (vdo_is_state_compressed(agent->duplicate.state)) {
+		launch_data_vio_cpu_callback(agent,
+					     uncompress_and_verify,
+					     CPU_Q_COMPRESS_BLOCK_PRIORITY);
+		return;
+	}
+
+	launch_data_vio_cpu_callback(agent, verify_callback, CPU_Q_COMPLETE_READ_PRIORITY);
+}
+
+/**
+ * start_verifying() - Begin the data verification phase.
+ * @lock: The hash lock (must be LOCKING).
+ * @agent: The data_vio to use to read and compare candidate data.
+ *
+ * Continue the deduplication path for a hash lock by using the agent to read (and possibly
+ * decompress) the data at the candidate duplicate location, comparing it to the data in the agent
+ * to verify that the candidate is identical to all the data_vios sharing the hash. If so, it can
+ * be deduplicated against, otherwise a data_vio allocation will have to be written to and used for
+ * dedupe.
+ */
+static void start_verifying(struct hash_lock *lock, struct data_vio *agent)
+{
+	int result;
+	struct vio *vio = &agent->vio;
+	char *buffer = (vdo_is_state_compressed(agent->duplicate.state) ?
+			(char *) agent->compression.block :
+			agent->scratch_block);
+
+	lock->state = VDO_HASH_LOCK_VERIFYING;
+	ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
+
+	agent->last_async_operation = VIO_ASYNC_OP_VERIFY_DUPLICATION;
+	result = vio_reset_bio(vio, buffer, verify_endio, REQ_OP_READ, agent->duplicate.pbn);
+	if (result != VDO_SUCCESS) {
+		set_data_vio_hash_zone_callback(agent, finish_verifying);
+		continue_data_vio_with_error(agent, result);
+		return;
+	}
+
+	set_data_vio_bio_zone_callback(agent, process_vio_io);
+	vdo_launch_completion_with_priority(&vio->completion, BIO_Q_VERIFY_PRIORITY);
+}
+
+/**
+ * finish_locking() - Handle the result of the agent for the lock attempting to obtain a PBN read
+ *                    lock on the candidate duplicate block.
+ * @completion: The completion of the data_vio that attempted to get the read lock.
+ *
+ * This continuation is registered in lock_duplicate_pbn().
+ */
+static void finish_locking(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct hash_lock *lock = agent->hash_lock;
+
+	assert_hash_lock_agent(agent, __func__);
+
+	if (!agent->is_duplicate) {
+		ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+				"must not hold duplicate_lock if not flagged as a duplicate");
+		/*
+		 * LOCKING -> WRITING transition: The advice block is being modified or has no
+		 * available references, so try to write or compress the data, remembering to
+		 * update UDS later with the new advice.
+		 */
+		increment_stat(&agent->hash_zone->statistics.dedupe_advice_stale);
+		lock->update_advice = true;
+		start_writing(lock, agent);
+		return;
+	}
+
+	ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
+			"must hold duplicate_lock if flagged as a duplicate");
+
+	if (!lock->verified) {
+		/*
+		 * LOCKING -> VERIFYING transition: Continue on the unverified dedupe path, reading
+		 * the candidate duplicate and comparing it to the agent's data to decide whether
+		 * it is a true duplicate or stale advice.
+		 */
+		start_verifying(lock, agent);
+		return;
+	}
+
+	if (!vdo_claim_pbn_lock_increment(lock->duplicate_lock)) {
+		/*
+		 * LOCKING -> UNLOCKING transition: The verified block was re-locked, but has no
+		 * available increments left. Must first release the useless PBN read lock before
+		 * rolling over to a new copy of the block.
+		 */
+		agent->is_duplicate = false;
+		lock->verified = false;
+		lock->update_advice = true;
+		start_unlocking(lock, agent);
+		return;
+	}
+
+	/*
+	 * LOCKING -> DEDUPING transition: Continue on the verified dedupe path, deduplicating
+	 * against a location that was previously verified or written to.
+	 */
+	start_deduping(lock, agent, false);
+}
+
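+/*
+ * Acquire a provisional reference on the newly locked duplicate block. If the reference cannot
+ * be acquired, release the PBN lock and abandon deduplication for the agent.
+ */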
+static bool acquire_provisional_reference(struct data_vio *agent,
+					  struct pbn_lock *lock,
+					  struct slab_depot *depot)
+{
+	/* Ensure that the newly-locked block is referenced. */
+	struct vdo_slab *slab = vdo_get_slab(depot, agent->duplicate.pbn);
+	int result = vdo_acquire_provisional_reference(slab, agent->duplicate.pbn, lock);
+
+	if (result == VDO_SUCCESS)
+		return true;
+
+	uds_log_warning_strerror(result,
+				 "Error acquiring provisional reference for dedupe candidate; aborting dedupe");
+	agent->is_duplicate = false;
+	vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, agent->duplicate.pbn, lock);
+	continue_data_vio_with_error(agent, result);
+	return false;
+}
+
+/**
+ * lock_duplicate_pbn() - Acquire a read lock on the PBN of the block containing candidate
+ *                        duplicate data (compressed or uncompressed).
+ * @completion: The completion of the data_vio attempting to acquire the physical block lock on
+ *              behalf of its hash lock.
+ *
+ * If the PBN is already locked for writing, the lock attempt is abandoned and is_duplicate will be
+ * cleared before calling back. This continuation is launched from start_locking(), and calls back
+ * to finish_locking() on the hash zone thread.
+ */
+static void lock_duplicate_pbn(struct vdo_completion *completion)
+{
+	unsigned int increment_limit;
+	struct pbn_lock *lock;
+	int result;
+
+	struct data_vio *agent = as_data_vio(completion);
+	struct slab_depot *depot = vdo_from_data_vio(agent)->depot;
+	struct physical_zone *zone = agent->duplicate.zone;
+
+	assert_data_vio_in_duplicate_zone(agent);
+
+	set_data_vio_hash_zone_callback(agent, finish_locking);
+
+	/*
+	 * While in the zone that owns it, find out how many additional references can be made to
+	 * the block if it turns out to truly be a duplicate.
+	 */
+	increment_limit = vdo_get_increment_limit(depot, agent->duplicate.pbn);
+	if (increment_limit == 0) {
+		/*
+		 * We could deduplicate against it later if a reference happened to be released
+		 * during verification, but it's probably better to bail out now.
+		 */
+		agent->is_duplicate = false;
+		continue_data_vio(agent);
+		return;
+	}
+
+	result = vdo_attempt_physical_zone_pbn_lock(zone,
+						    agent->duplicate.pbn,
+						    VIO_READ_LOCK,
+						    &lock);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(agent, result);
+		return;
+	}
+
+	if (!vdo_is_pbn_read_lock(lock)) {
+		/*
+		 * There are three cases of write locks: uncompressed data block writes, compressed
+		 * (packed) block writes, and block map page writes. In all three cases, we give up
+		 * on trying to verify the advice and don't bother trying to deduplicate against the
+		 * data in the write lock holder.
+		 *
+		 * 1) We don't ever want to try to deduplicate against a block map page.
+		 *
+		 * 2a) It's very unlikely we'd deduplicate against an entire packed block, both
+		 * because of the chance of matching it, and because we don't record advice for it,
+		 * only for the uncompressed representation of all the fragments it contains. The
+		 * only way we'd be getting lock contention is if we've written the same
+		 * representation coincidentally before, had it become unreferenced, and it just
+		 * happened to be packed together from compressed writes when we go to verify the
+		 * lucky advice. Giving up is a minuscule loss of potential dedupe.
+		 *
+		 * 2b) If the advice is for a slot of a compressed block, it's about to get
+		 * smashed, and the write smashing it cannot contain our data--it would have to be
+		 * writing on behalf of our hash lock, but that's impossible since we're the lock
+		 * agent.
+		 *
+		 * 3a) If the lock is held by a data_vio with different data, the advice is already
+		 * stale or is about to become stale.
+		 *
+		 * 3b) If the lock is held by a data_vio that matches us, we may as well either
+		 * write it ourselves (or reference the copy we already wrote) instead of
+		 * potentially having many duplicates wait for the lock holder to write, journal,
+		 * hash, and finally arrive in the hash lock. We lose a chance to avoid a UDS
+		 * update in the very rare case of advice for a free block that just happened to be
+		 * allocated to a data_vio with the same hash. There's also a chance to save on a
+		 * block write, at the cost of a block verify. Saving on a full block compare in
+		 * all stale advice cases almost certainly outweighs saving a UDS update and
+		 * trading a write for a read in a lucky case where advice would have been saved
+		 * from becoming stale.
+		 */
+		agent->is_duplicate = false;
+		continue_data_vio(agent);
+		return;
+	}
+
+	if (lock->holder_count == 0) {
+		if (!acquire_provisional_reference(agent, lock, depot))
+			return;
+
+		/*
+		 * The increment limit we grabbed earlier is still valid. The lock now holds the
+		 * rights to acquire all those references. Those rights will be claimed by hash
+		 * locks sharing this read lock.
+		 */
+		lock->increment_limit = increment_limit;
+	}
+
+	/*
+	 * We've successfully acquired a read lock on behalf of the hash lock, so mark it as such.
+	 */
+	set_duplicate_lock(agent->hash_lock, lock);
+
+	/*
+	 * TODO: Optimization: We could directly launch the block verify, then switch to a hash
+	 * thread.
+	 */
+	continue_data_vio(agent);
+}
+
+/**
+ * start_locking() - Continue deduplication for a hash lock that has obtained valid advice of a
+ *                   potential duplicate through its agent.
+ * @lock: The hash lock (currently must be QUERYING).
+ * @agent: The data_vio bearing the dedupe advice.
+ */
+static void start_locking(struct hash_lock *lock, struct data_vio *agent)
+{
+	ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
+			"must not acquire a duplicate lock when already holding it");
+
+	lock->state = VDO_HASH_LOCK_LOCKING;
+
+	/*
+	 * TODO: Optimization: If we arrange to continue on the duplicate zone thread when
+	 * accepting the advice, and don't explicitly change lock states (or use an agent-local
+	 * state, or an atomic), we can avoid a thread transition here.
+	 */
+	agent->last_async_operation = VIO_ASYNC_OP_LOCK_DUPLICATE_PBN;
+	launch_data_vio_duplicate_zone_callback(agent, lock_duplicate_pbn);
+}
+
+/**
+ * finish_writing() - Re-entry point for the lock agent after it has finished writing or
+ *                    compressing its copy of the data block.
+ * @lock: The hash lock, which must be in state WRITING.
+ * @agent: The data_vio that wrote its data for the lock.
+ *
+ * The agent will never need to dedupe against anything, so it's done with the lock, but the lock
+ * may not be finished with it, as a UDS update might still be needed.
+ *
+ * If there are other lock holders, the agent will hand the job to one of them and exit, leaving
+ * the lock to deduplicate against the just-written block. If there are no other lock holders, the
+ * agent either exits (and later tears down the hash lock), or it remains the agent and updates
+ * UDS.
+ */
+static void finish_writing(struct hash_lock *lock, struct data_vio *agent)
+{
+	/*
+	 * Dedupe against the data block or compressed block slot the agent wrote. Since we know
+	 * the write succeeded, there's no need to verify it.
+	 */
+	lock->duplicate = agent->new_mapped;
+	lock->verified = true;
+
+	if (vdo_is_state_compressed(lock->duplicate.state) &&
+	    lock->registered)
+		/*
+		 * Compression means the location we gave in the UDS query is not the location
+		 * we're using to deduplicate.
+		 */
+		lock->update_advice = true;
+
+	/* If there are any waiters, we need to start deduping them. */
+	if (vdo_has_waiters(&lock->waiters)) {
+		/*
+		 * WRITING -> DEDUPING transition: an asynchronously-written block failed to
+		 * compress, so the PBN lock on the written copy was already transferred. The agent
+		 * is done with the lock, but the lock may still need to use it to clean up after
+		 * rollover.
+		 */
+		start_deduping(lock, agent, true);
+		return;
+	}
+
+	/*
+	 * There are no waiters and the agent has successfully written, so take a step towards
+	 * being able to release the hash lock (or just release it).
+	 */
+	if (lock->update_advice) {
+		/*
+		 * WRITING -> UPDATING transition: There's no waiter and a UDS update is needed, so
+		 * retain the WRITING agent and use it to launch the update. This happens on
+		 * compression, rollover, or the QUERYING agent not having an allocation.
+		 */
+		start_updating(lock, agent);
+	} else if (lock->duplicate_lock != NULL) {
+		/*
+		 * WRITING -> UNLOCKING transition: There's no waiter and no update needed, but the
+		 * compressed write gave us a shared duplicate lock that we must release.
+		 */
+		set_duplicate_location(agent, lock->duplicate);
+		start_unlocking(lock, agent);
+	} else {
+		/*
+		 * WRITING -> BYPASSING transition: There's no waiter, no update needed, and no
+		 * duplicate lock held, so both the agent and lock have no more work to do. The
+		 * agent will release its allocation lock in cleanup.
+		 */
+		start_bypassing(lock, agent);
+	}
+}
+
+/**
+ * select_writing_agent() - Search through the lock waiters for a data_vio that has an allocation.
+ * @lock: The hash lock to modify.
+ *
+ * If an allocation is found, swap agents, put the old agent at the head of the wait queue, then
+ * return the new agent. Otherwise, just return the current agent.
+ */
+static struct data_vio *select_writing_agent(struct hash_lock *lock)
+{
+	struct wait_queue temp_queue;
+	struct data_vio *data_vio;
+
+	vdo_initialize_wait_queue(&temp_queue);
+
+	/*
+	 * Move waiters to the temp queue one-by-one until we find an allocation. Not ideal to
+	 * search, but it only happens when nearly out of space.
+	 */
+	while (((data_vio = dequeue_lock_waiter(lock)) != NULL) &&
+	       !data_vio_has_allocation(data_vio)) {
+		/* Use the lower-level enqueue since we're just moving waiters around. */
+		vdo_enqueue_waiter(&temp_queue, &data_vio->waiter);
+	}
+
+	if (data_vio != NULL) {
+		/*
+		 * Move the rest of the waiters over to the temp queue, preserving the order they
+		 * arrived at the lock.
+		 */
+		vdo_transfer_all_waiters(&lock->waiters, &temp_queue);
+
+		/*
+		 * The current agent is being replaced and will have to wait to dedupe; make it the
+		 * first waiter since it was the first to reach the lock.
+		 */
+		vdo_enqueue_waiter(&lock->waiters, &lock->agent->waiter);
+		lock->agent = data_vio;
+	} else {
+		/* No one has an allocation, so keep the current agent. */
+		data_vio = lock->agent;
+	}
+
+	/* Swap all the waiters back onto the lock's queue. */
+	vdo_transfer_all_waiters(&temp_queue, &lock->waiters);
+	return data_vio;
+}
+
+/**
+ * start_writing() - Begin the non-duplicate write path.
+ * @lock: The hash lock (currently must be QUERYING).
+ * @agent: The data_vio currently acting as the agent for the lock.
+ *
+ * Begins the non-duplicate write path for a hash lock that had no advice, selecting a data_vio
+ * with an allocation as a new agent, if necessary, then resuming the agent on the data_vio write
+ * path.
+ */
+static void start_writing(struct hash_lock *lock, struct data_vio *agent)
+{
+	lock->state = VDO_HASH_LOCK_WRITING;
+
+	/*
+	 * The agent might not have received an allocation and so can't be used for writing, but
+	 * it's entirely possible that one of the waiters did.
+	 */
+	if (!data_vio_has_allocation(agent)) {
+		agent = select_writing_agent(lock);
+		/* If none of the waiters had an allocation, the writes all have to fail. */
+		if (!data_vio_has_allocation(agent)) {
+			/*
+			 * TODO: Should we keep a variant of BYPASSING that causes new arrivals to
+			 * fail immediately if they don't have an allocation? It might be possible
+			 * that on some path there would be non-waiters still referencing the lock,
+			 * so it would remain in the map as everything is currently spelled, even
+			 * if the agent and all waiters release.
+			 */
+			continue_data_vio_with_error(agent, VDO_NO_SPACE);
+			return;
+		}
+	}
+
+	/*
+	 * If the agent compresses, it might wait indefinitely in the packer, which would be bad if
+	 * there are any other data_vios waiting.
+	 */
+	if (vdo_has_waiters(&lock->waiters))
+		cancel_data_vio_compression(agent);
+
+	/*
+	 * Send the agent to the data_vio compress/pack/write path. If it succeeds, it will
+	 * return to the hash lock via vdo_continue_hash_lock() and call finish_writing().
+	 */
+	launch_compress_data_vio(agent);
+}
+
+/*
+ * Decode VDO duplicate advice from the old_metadata field of a UDS request.
+ * Returns true if valid advice was found and decoded.
+ */
+static bool decode_uds_advice(struct dedupe_context *context)
+{
+	const struct uds_request *request = &context->request;
+	struct data_vio *data_vio = context->requestor;
+	size_t offset = 0;
+	const struct uds_record_data *encoding = &request->old_metadata;
+	struct vdo *vdo = vdo_from_data_vio(data_vio);
+	struct zoned_pbn *advice = &data_vio->duplicate;
+	u8 version;
+	int result;
+
+	if ((request->status != UDS_SUCCESS) || !request->found)
+		return false;
+
+	version = encoding->data[offset++];
+	if (version != UDS_ADVICE_VERSION) {
+		uds_log_error("invalid UDS advice version code %u", version);
+		return false;
+	}
+
+	advice->state = encoding->data[offset++];
+	advice->pbn = get_unaligned_le64(&encoding->data[offset]);
+	offset += sizeof(u64);
+	BUG_ON(offset != UDS_ADVICE_SIZE);
+
+	/* Don't use advice that's clearly meaningless. */
+	if ((advice->state == VDO_MAPPING_STATE_UNMAPPED) || (advice->pbn == VDO_ZERO_BLOCK)) {
+		uds_log_debug("Invalid advice from deduplication server: pbn %llu, state %u. Giving up on deduplication of logical block %llu",
+			      (unsigned long long) advice->pbn,
+			      advice->state,
+			      (unsigned long long) data_vio->logical.lbn);
+		atomic64_inc(&vdo->stats.invalid_advice_pbn_count);
+		return false;
+	}
+
+	result = vdo_get_physical_zone(vdo, advice->pbn, &advice->zone);
+	if ((result != VDO_SUCCESS) || (advice->zone == NULL)) {
+		uds_log_debug("Invalid physical block number from deduplication server: %llu, giving up on deduplication of logical block %llu",
+			      (unsigned long long) advice->pbn,
+			      (unsigned long long) data_vio->logical.lbn);
+		atomic64_inc(&vdo->stats.invalid_advice_pbn_count);
+		return false;
+	}
+
+	return true;
+}
+
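+/*
+ * If the agent's dedupe context has completed, decode the advice it received and release the
+ * context back to its zone.
+ */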
+static void process_query_result(struct data_vio *agent)
+{
+	struct dedupe_context *context = agent->dedupe_context;
+
+	if (context == NULL)
+		return;
+
+	if (change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE)) {
+		agent->is_duplicate = decode_uds_advice(context);
+		release_context(context);
+	}
+}
+
+/**
+ * finish_querying() - Process the result of a UDS query performed by the agent for the lock.
+ * @completion: The completion of the data_vio that performed the query.
+ *
+ * This continuation is registered in start_querying().
+ */
+static void finish_querying(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct hash_lock *lock = agent->hash_lock;
+
+	assert_hash_lock_agent(agent, __func__);
+
+	process_query_result(agent);
+
+	if (agent->is_duplicate) {
+		lock->duplicate = agent->duplicate;
+		/*
+		 * QUERYING -> LOCKING transition: Valid advice was obtained from UDS. Use the
+		 * QUERYING agent to start the hash lock on the unverified dedupe path, verifying
+		 * that the advice can be used.
+		 */
+		start_locking(lock, agent);
+	} else {
+		/*
+		 * The agent will be used as the duplicate if it has an allocation; if it does, that
+		 * location was posted to UDS, so no update will be needed.
+		 */
+		lock->update_advice = !data_vio_has_allocation(agent);
+		/*
+		 * QUERYING -> WRITING transition: There was no advice or the advice wasn't valid,
+		 * so try to write or compress the data.
+		 */
+		start_writing(lock, agent);
+	}
+}
+
+/**
+ * start_querying() - Start deduplication for a hash lock.
+ * @lock: The initialized hash lock.
+ * @data_vio: The data_vio that has just obtained the new lock.
+ *
+ * Starts deduplication for a hash lock that has finished initializing by making the data_vio that
+ * requested it the agent, entering the QUERYING state, and using the agent to perform the UDS
+ * query on behalf of the lock.
+ */
+static void start_querying(struct hash_lock *lock, struct data_vio *data_vio)
+{
+	lock->agent = data_vio;
+	lock->state = VDO_HASH_LOCK_QUERYING;
+	data_vio->last_async_operation = VIO_ASYNC_OP_CHECK_FOR_DUPLICATION;
+	set_data_vio_hash_zone_callback(data_vio, finish_querying);
+	query_index(data_vio, (data_vio_has_allocation(data_vio) ? UDS_POST : UDS_QUERY));
+}
+
+/**
+ * report_bogus_lock_state() - Complain that a data_vio has entered a hash_lock that is in an
+ *                             unimplemented or unusable state and continue the data_vio with an
+ *                             error.
+ * @lock: The hash lock.
+ * @data_vio: The data_vio attempting to enter the lock.
+ */
+static void report_bogus_lock_state(struct hash_lock *lock, struct data_vio *data_vio)
+{
+	ASSERT_LOG_ONLY(false,
+			"hash lock must not be in unimplemented state %s",
+			get_hash_lock_state_name(lock->state));
+	continue_data_vio_with_error(data_vio, VDO_LOCK_ERROR);
+}
+
+/**
+ * vdo_continue_hash_lock() - Continue the processing state after writing, compressing, or
+ *                            deduplicating.
+ * @completion: The completion of the data_vio to continue processing in its hash lock.
+ *
+ * Asynchronously continue processing a data_vio in its hash lock after it has finished writing,
+ * compressing, or deduplicating, so it can share the result with any data_vios waiting in the hash
+ * lock, or update the UDS index, or simply release its share of the lock.
+ *
+ * Context: This must only be called in the correct thread for the hash zone.
+ */
+void vdo_continue_hash_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct hash_lock *lock = data_vio->hash_lock;
+
+	switch (lock->state) {
+	case VDO_HASH_LOCK_WRITING:
+		ASSERT_LOG_ONLY(data_vio == lock->agent,
+				"only the lock agent may continue the lock");
+		finish_writing(lock, data_vio);
+		break;
+
+	case VDO_HASH_LOCK_DEDUPING:
+		finish_deduping(lock, data_vio);
+		break;
+
+	case VDO_HASH_LOCK_BYPASSING:
+		/* This data_vio has finished the write path and the lock doesn't need it. */
+		exit_hash_lock(data_vio);
+		break;
+
+	case VDO_HASH_LOCK_INITIALIZING:
+	case VDO_HASH_LOCK_QUERYING:
+	case VDO_HASH_LOCK_UPDATING:
+	case VDO_HASH_LOCK_LOCKING:
+	case VDO_HASH_LOCK_VERIFYING:
+	case VDO_HASH_LOCK_UNLOCKING:
+		/* A lock in this state should never be re-entered. */
+		report_bogus_lock_state(lock, data_vio);
+		break;
+
+	default:
+		report_bogus_lock_state(lock, data_vio);
+	}
+}
+
+/**
+ * is_hash_collision() - Check to see if a hash collision has occurred.
+ * @lock: The lock to check.
+ * @candidate: The data_vio seeking to share the lock.
+ *
+ * Check whether the data in data_vios sharing a lock is different than in a data_vio seeking to
+ * share the lock, which should only be possible in the extremely unlikely case of a hash
+ * collision.
+ *
+ * Return: true if the given data_vio must not share the lock because it doesn't have the same data
+ *         as the lock holders.
+ */
+static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate)
+{
+	struct data_vio *lock_holder;
+	struct hash_zone *zone;
+	bool collides;
+
+	if (list_empty(&lock->duplicate_ring))
+		return false;
+
+	lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio, hash_lock_entry);
+	zone = candidate->hash_zone;
+	collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data);
+	if (collides)
+		increment_stat(&zone->statistics.concurrent_hash_collisions);
+	else
+		increment_stat(&zone->statistics.concurrent_data_matches);
+
+	return collides;
+}
+
+static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio)
+{
+	int result;
+
+	/* FIXME: BUG_ON() and/or enter read-only mode? */
+	result = ASSERT(data_vio->hash_lock == NULL, "must not already hold a hash lock");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = ASSERT(list_empty(&data_vio->hash_lock_entry),
+			"must not already be a member of a hash lock ring");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return ASSERT(data_vio->recovery_sequence_number == 0,
+		      "must not hold a recovery lock when getting a hash lock");
+}
+
+/**
+ * vdo_acquire_hash_lock() - Acquire or share a lock on a record name.
+ * @completion: The completion of the data_vio acquiring a lock on its record name.
+ *
+ * Acquire or share a lock on the hash (record name) of the data in a data_vio, updating the
+ * data_vio to reference the lock. This must only be called in the correct thread for the zone. In
+ * the unlikely case of a hash collision, this function will succeed, but the data_vio will not get
+ * a lock reference.
+ */
+void vdo_acquire_hash_lock(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct hash_lock *lock;
+	int result;
+
+	assert_data_vio_in_hash_zone(data_vio);
+
+	result = assert_hash_lock_preconditions(data_vio);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	result = acquire_lock(data_vio->hash_zone, &data_vio->record_name, NULL, &lock);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	if (is_hash_collision(lock, data_vio)) {
+		/*
+		 * Hash collisions are extremely unlikely, but the bogus dedupe would be a data
+		 * corruption. Bypass optimization entirely. We can't compress a data_vio without
+		 * a hash_lock as the compressed write depends on the hash_lock to manage the
+		 * references for the compressed block.
+		 */
+		write_data_vio(data_vio);
+		return;
+	}
+
+	set_hash_lock(data_vio, lock);
+	switch (lock->state) {
+	case VDO_HASH_LOCK_INITIALIZING:
+		start_querying(lock, data_vio);
+		return;
+
+	case VDO_HASH_LOCK_QUERYING:
+	case VDO_HASH_LOCK_WRITING:
+	case VDO_HASH_LOCK_UPDATING:
+	case VDO_HASH_LOCK_LOCKING:
+	case VDO_HASH_LOCK_VERIFYING:
+	case VDO_HASH_LOCK_UNLOCKING:
+		/* The lock is busy, and can't be shared yet. */
+		wait_on_hash_lock(lock, data_vio);
+		return;
+
+	case VDO_HASH_LOCK_BYPASSING:
+		/* We can't use this lock, so bypass optimization entirely. */
+		vdo_release_hash_lock(data_vio);
+		write_data_vio(data_vio);
+		return;
+
+	case VDO_HASH_LOCK_DEDUPING:
+		launch_dedupe(lock, data_vio, false);
+		return;
+
+	default:
+		/* A lock in this state should not be acquired by new VIOs. */
+		report_bogus_lock_state(lock, data_vio);
+	}
+}
+
+/**
+ * vdo_release_hash_lock() - Release a data_vio's share of a hash lock, if held, and null out the
+ *                           data_vio's reference to it.
+ * @data_vio: The data_vio releasing its hash lock.
+ *
+ * If the data_vio is the only one holding the lock, this also releases any resources or locks used
+ * by the hash lock (such as a PBN read lock on a block containing data with the same hash) and
+ * returns the lock to the hash zone's lock pool.
+ *
+ * Context: This must only be called in the correct thread for the hash zone.
+ */
+void vdo_release_hash_lock(struct data_vio *data_vio)
+{
+	struct hash_lock *lock = data_vio->hash_lock;
+	struct hash_zone *zone = data_vio->hash_zone;
+
+	if (lock == NULL)
+		return;
+
+	set_hash_lock(data_vio, NULL);
+
+	if (lock->reference_count > 0)
+		/* The lock is still in use by other data_vios. */
+		return;
+
+	if (lock->registered) {
+		struct hash_lock *removed;
+
+		removed = vdo_pointer_map_remove(zone->hash_lock_map, &lock->hash);
+		ASSERT_LOG_ONLY(lock == removed, "hash lock being released must have been mapped");
+	} else {
+		ASSERT_LOG_ONLY(lock != vdo_pointer_map_get(zone->hash_lock_map, &lock->hash),
+				"unregistered hash lock must not be in the lock map");
+	}
+
+	ASSERT_LOG_ONLY(!vdo_has_waiters(&lock->waiters),
+			"hash lock returned to zone must have no waiters");
+	ASSERT_LOG_ONLY((lock->duplicate_lock == NULL),
+			"hash lock returned to zone must not reference a PBN lock");
+	ASSERT_LOG_ONLY((lock->state == VDO_HASH_LOCK_BYPASSING),
+			"returned hash lock must not be in use with state %s",
+			get_hash_lock_state_name(lock->state));
+	ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
+			"hash lock returned to zone must not be in a pool ring");
+	ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring),
+			"hash lock returned to zone must not reference DataVIOs");
+
+	return_hash_lock_to_pool(zone, lock);
+}
+
+/**
+ * transfer_allocation_lock() - Transfer a data_vio's downgraded allocation PBN lock to the
+ *                              data_vio's hash lock, converting it to a duplicate PBN lock.
+ * @data_vio: The data_vio holding the allocation lock to transfer.
+ */
+static void transfer_allocation_lock(struct data_vio *data_vio)
+{
+	struct allocation *allocation = &data_vio->allocation;
+	struct hash_lock *hash_lock = data_vio->hash_lock;
+
+	ASSERT_LOG_ONLY(data_vio->new_mapped.pbn == allocation->pbn,
+			"transferred lock must be for the block written");
+
+	allocation->pbn = VDO_ZERO_BLOCK;
+
+	ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(allocation->lock),
+			"must have downgraded the allocation lock before transfer");
+
+	hash_lock->duplicate = data_vio->new_mapped;
+	data_vio->duplicate = data_vio->new_mapped;
+
+	/*
+	 * Since the lock is being transferred, the holder count doesn't change (and isn't even
+	 * safe to examine on this thread).
+	 */
+	hash_lock->duplicate_lock = UDS_FORGET(allocation->lock);
+}
+
+/**
+ * vdo_share_compressed_write_lock() - Make a data_vio's hash lock a shared holder of the PBN lock
+ *                                     on the compressed block to which its data was just written.
+ * @data_vio: The data_vio which was just compressed.
+ * @pbn_lock: The PBN lock on the compressed block.
+ *
+ * If the lock is still a write lock (as it will be for the first share), it will be converted to a
+ * read lock. This also reserves a reference count increment for the data_vio.
+ */
+void vdo_share_compressed_write_lock(struct data_vio *data_vio, struct pbn_lock *pbn_lock)
+{
+	bool claimed;
+
+	ASSERT_LOG_ONLY(vdo_get_duplicate_lock(data_vio) == NULL,
+			"a duplicate PBN lock should not exist when writing");
+	ASSERT_LOG_ONLY(vdo_is_state_compressed(data_vio->new_mapped.state),
+			"lock transfer must be for a compressed write");
+	assert_data_vio_in_new_mapped_zone(data_vio);
+
+	/* First sharer downgrades the lock. */
+	if (!vdo_is_pbn_read_lock(pbn_lock))
+		vdo_downgrade_pbn_write_lock(pbn_lock, true);
+
+	/*
+	 * Get a share of the PBN lock, ensuring it cannot be released until after this data_vio
+	 * has had a chance to journal a reference.
+	 */
+	data_vio->duplicate = data_vio->new_mapped;
+	data_vio->hash_lock->duplicate = data_vio->new_mapped;
+	set_duplicate_lock(data_vio->hash_lock, pbn_lock);
+
+	/*
+	 * Claim a reference for this data_vio. This is necessary since another hash_lock might
+	 * start deduplicating against it before our reference count increment.
+	 */
+	claimed = vdo_claim_pbn_lock_increment(pbn_lock);
+	ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment");
+}
+
+/** compare_keys() - Implements pointer_key_comparator. */
+static bool compare_keys(const void *this_key, const void *that_key)
+{
+	/* Null keys are not supported. */
+	return (memcmp(this_key, that_key, sizeof(struct uds_record_name)) == 0);
+}
+
+/** hash_key() - Implements pointer_key_hasher. */
+static u32 hash_key(const void *key)
+{
+	const struct uds_record_name *name = key;
+
+	/* Use a fragment of the record name as a hash code. */
+	return get_unaligned_le32(&name->name[4]);
+}
+
+static void dedupe_kobj_release(struct kobject *directory)
+{
+	UDS_FREE(container_of(directory, struct hash_zones, dedupe_directory));
+}
+
+static ssize_t dedupe_status_show(struct kobject *directory, struct attribute *attr, char *buf)
+{
+	struct uds_attribute *ua = container_of(attr, struct uds_attribute, attr);
+	struct hash_zones *zones = container_of(directory, struct hash_zones, dedupe_directory);
+
+	if (ua->show_string != NULL)
+		return sprintf(buf, "%s\n", ua->show_string(zones));
+	else
+		return -EINVAL;
+}
+
+static ssize_t dedupe_status_store(struct kobject *kobj __always_unused,
+				   struct attribute *attr __always_unused,
+				   const char *buf __always_unused,
+				   size_t length __always_unused)
+{
+	return -EINVAL;
+}
+
+/*----------------------------------------------------------------------*/
+
+static const struct sysfs_ops dedupe_sysfs_ops = {
+	.show = dedupe_status_show,
+	.store = dedupe_status_store,
+};
+
+static struct uds_attribute dedupe_status_attribute = {
+	.attr = {.name = "status", .mode = 0444, },
+	.show_string = vdo_get_dedupe_index_state_name,
+};
+
+static struct attribute *dedupe_attrs[] = {
+	&dedupe_status_attribute.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(dedupe);
+
+static struct kobj_type dedupe_directory_type = {
+	.release = dedupe_kobj_release,
+	.sysfs_ops = &dedupe_sysfs_ops,
+	.default_groups = dedupe_groups,
+};
+
+static void start_uds_queue(void *ptr)
+{
+	/*
+	 * Allow the UDS dedupe worker thread to do memory allocations. It will only do allocations
+	 * during the UDS calls that open or close an index, but those allocations can safely sleep
+	 * while reserving a large amount of memory. We could use an allocations_allowed boolean
+	 * (like the base threads do), but it would be an unnecessary embellishment.
+	 */
+	struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue());
+
+	uds_register_allocating_thread(&thread->allocating_thread, NULL);
+}
+
+static void finish_uds_queue(void *ptr __always_unused)
+{
+	uds_unregister_allocating_thread();
+}
+
+static void close_index(struct hash_zones *zones)
+{
+	int result;
+
+	/*
+	 * Change the index state so that get_index_statistics() will not try to use the index
+	 * session we are closing.
+	 */
+	zones->index_state = IS_CHANGING;
+	/* Close the index session, while not holding the lock. */
+	spin_unlock(&zones->lock);
+	result = uds_close_index(zones->index_session);
+
+	if (result != UDS_SUCCESS)
+		uds_log_error_strerror(result, "Error closing index");
+	spin_lock(&zones->lock);
+	zones->index_state = IS_CLOSED;
+	zones->error_flag |= result != UDS_SUCCESS;
+	/* ASSERTION: We leave in IS_CLOSED state. */
+}
+
+static void open_index(struct hash_zones *zones)
+{
+	/* ASSERTION: We enter in IS_CLOSED state. */
+	int result;
+	bool create_flag = zones->create_flag;
+
+	zones->create_flag = false;
+	/*
+	 * Change the index state so that it will be reported to the outside world as
+	 * "opening".
+	 */
+	zones->index_state = IS_CHANGING;
+	zones->error_flag = false;
+
+	/* Open the index session, while not holding the lock */
+	spin_unlock(&zones->lock);
+	result = uds_open_index(create_flag ? UDS_CREATE : UDS_LOAD,
+				&zones->parameters,
+				zones->index_session);
+	if (result != UDS_SUCCESS)
+		uds_log_error_strerror(result, "Error opening index");
+
+	spin_lock(&zones->lock);
+	if (!create_flag) {
+		switch (result) {
+		case -ENOENT:
+			/*
+			 * Either there is no index, or there is no way we can recover the index.
+			 * We will be called again and try to create a new index.
+			 */
+			zones->index_state = IS_CLOSED;
+			zones->create_flag = true;
+			return;
+		default:
+			break;
+		}
+	}
+	if (result == UDS_SUCCESS) {
+		zones->index_state = IS_OPENED;
+	} else {
+		zones->index_state = IS_CLOSED;
+		zones->index_target = IS_CLOSED;
+		zones->error_flag = true;
+		spin_unlock(&zones->lock);
+		uds_log_info("Setting UDS index target state to error");
+		spin_lock(&zones->lock);
+	}
+	/*
+	 * ASSERTION: On success, we leave in IS_OPENED state.
+	 * ASSERTION: On failure, we leave in IS_CLOSED state.
+	 */
+}
+
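+/**
+ * change_dedupe_state() - Open or close the index to bring it to the target state.
+ * @completion: The hash_zones completion, which runs on the dedupe thread.
+ *
+ * Loops, opening or closing the index session, until the index state matches the target state
+ * and the create flag is clear, or until the zones are no longer in normal operation.
+ */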
+static void change_dedupe_state(struct vdo_completion *completion)
+{
+	struct hash_zones *zones = as_hash_zones(completion);
+
+	spin_lock(&zones->lock);
+
+	/* Loop until the index is in the target state and the create flag is clear. */
+	while (vdo_is_state_normal(&zones->state) &&
+	       ((zones->index_state != zones->index_target) || zones->create_flag)) {
+		if (zones->index_state == IS_OPENED)
+			close_index(zones);
+		else
+			open_index(zones);
+	}
+
+	zones->changing = false;
+	spin_unlock(&zones->lock);
+}
+
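+/*
+ * The per-zone query timer moves through three states: DEDUPE_QUERY_TIMER_IDLE when no timer is
+ * armed, DEDUPE_QUERY_TIMER_RUNNING while a timer is pending for the oldest unanswered query, and
+ * DEDUPE_QUERY_TIMER_FIRED between the timer callback firing and the timeout processing on the
+ * hash zone thread.
+ */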
+static void start_expiration_timer(struct dedupe_context *context)
+{
+	u64 start_time = context->submission_jiffies;
+	u64 end_time;
+
+	if (!change_timer_state(context->zone,
+				DEDUPE_QUERY_TIMER_IDLE,
+				DEDUPE_QUERY_TIMER_RUNNING))
+		return;
+
+	end_time = max(start_time + vdo_dedupe_index_timeout_jiffies,
+		       jiffies + vdo_dedupe_index_min_timer_jiffies);
+	mod_timer(&context->zone->timer, end_time);
+}
+
+/**
+ * report_dedupe_timeouts() - Record and eventually report that some dedupe requests reached their
+ *                            expiration time without getting answers, so we timed them out.
+ * @zones: the hash zones.
+ * @timeouts: the number of newly timed out requests.
+ */
+static void report_dedupe_timeouts(struct hash_zones *zones, unsigned int timeouts)
+{
+	atomic64_add(timeouts, &zones->timeouts);
+	spin_lock(&zones->lock);
+	if (__ratelimit(&zones->ratelimiter)) {
+		u64 unreported = atomic64_read(&zones->timeouts);
+
+		unreported -= zones->reported_timeouts;
+		uds_log_debug("UDS index timeout on %llu requests",
+			      (unsigned long long) unreported);
+		zones->reported_timeouts += unreported;
+	}
+	spin_unlock(&zones->lock);
+}
+
+static int initialize_index(struct vdo *vdo, struct hash_zones *zones)
+{
+	int result;
+	off_t uds_offset;
+	struct volume_geometry geometry = vdo->geometry;
+	static const struct vdo_work_queue_type uds_queue_type = {
+		.start = start_uds_queue,
+		.finish = finish_uds_queue,
+		.max_priority = UDS_Q_MAX_PRIORITY,
+		.default_priority = UDS_Q_PRIORITY,
+	};
+
+	vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval);
+	vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval);
+
+	/*
+	 * Since we will save up the timeouts that would have been reported but were ratelimited,
+	 * we don't need to report ratelimiting.
+	 */
+	ratelimit_default_init(&zones->ratelimiter);
+	ratelimit_set_flags(&zones->ratelimiter, RATELIMIT_MSG_ON_RELEASE);
+	uds_offset = ((vdo_get_index_region_start(geometry) -
+		       geometry.bio_offset) * VDO_BLOCK_SIZE);
+	zones->parameters = (struct uds_parameters) {
+		.name = vdo->device_config->parent_device_name,
+		.offset = uds_offset,
+		.size = (vdo_get_index_region_size(geometry) * VDO_BLOCK_SIZE),
+		.memory_size = geometry.index_config.mem,
+		.sparse = geometry.index_config.sparse,
+		.nonce = (u64) geometry.nonce,
+	};
+
+	result = uds_create_index_session(&zones->index_session);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = vdo_make_thread(vdo, vdo->thread_config.dedupe_thread, &uds_queue_type, 1, NULL);
+	if (result != VDO_SUCCESS) {
+		uds_destroy_index_session(UDS_FORGET(zones->index_session));
+		uds_log_error("UDS index queue initialization failed (%d)", result);
+		return result;
+	}
+
+	vdo_initialize_completion(&zones->completion, vdo, VDO_HASH_ZONES_COMPLETION);
+	vdo_set_completion_callback(&zones->completion,
+				    change_dedupe_state,
+				    vdo->thread_config.dedupe_thread);
+	kobject_init(&zones->dedupe_directory, &dedupe_directory_type);
+	return VDO_SUCCESS;
+}
+
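+/*
+ * Dedupe contexts move through a small state machine: DEDUPE_CONTEXT_IDLE to
+ * DEDUPE_CONTEXT_PENDING when a query is launched; PENDING to DEDUPE_CONTEXT_COMPLETE if the
+ * index answers before the timeout, or PENDING to DEDUPE_CONTEXT_TIMED_OUT if the timer fires
+ * first; and TIMED_OUT to DEDUPE_CONTEXT_TIMED_OUT_COMPLETE once the index finally answers, at
+ * which point the context can be recycled.
+ */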
+/**
+ * finish_index_operation() - The UDS callback for index queries.
+ * @request: The uds request which has just completed.
+ */
+static void finish_index_operation(struct uds_request *request)
+{
+	struct dedupe_context *context = container_of(request, struct dedupe_context, request);
+
+	if (change_context_state(context, DEDUPE_CONTEXT_PENDING, DEDUPE_CONTEXT_COMPLETE)) {
+		/*
+		 * This query has not timed out, so send its data_vio back to its hash zone to
+		 * process the results.
+		 */
+		continue_data_vio(context->requestor);
+		return;
+	}
+
+	/*
+	 * This query has timed out, so try to mark it complete and hence eligible for reuse. Its
+	 * data_vio has already moved on.
+	 */
+	if (!change_context_state(context,
+				  DEDUPE_CONTEXT_TIMED_OUT,
+				  DEDUPE_CONTEXT_TIMED_OUT_COMPLETE))
+		ASSERT_LOG_ONLY(false,
+				"uds request was timed out (state %d)",
+				atomic_read(&context->state));
+
+	uds_funnel_queue_put(context->zone->timed_out_complete, &context->queue_entry);
+}
+
+/**
+ * check_for_drain_complete() - Check whether this zone has drained.
+ * @zone: The zone to check.
+ */
+static void check_for_drain_complete(struct hash_zone *zone)
+{
+	data_vio_count_t recycled = 0;
+
+	if (!vdo_is_state_draining(&zone->state))
+		return;
+
+	if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) ||
+	    change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_IDLE))
+		del_timer_sync(&zone->timer);
+	else
+		/*
+		 * There is an in-flight timeout, which must be processed before we can continue.
+		 */
+		return;
+
+	for (;;) {
+		struct dedupe_context *context;
+		struct funnel_queue_entry *entry;
+
+		entry = uds_funnel_queue_poll(zone->timed_out_complete);
+		if (entry == NULL)
+			break;
+
+		context = container_of(entry, struct dedupe_context, queue_entry);
+		atomic_set(&context->state, DEDUPE_CONTEXT_IDLE);
+		list_add(&context->list_entry, &zone->available);
+		recycled++;
+	}
+
+	if (recycled > 0)
+		WRITE_ONCE(zone->active, zone->active - recycled);
+	ASSERT_LOG_ONLY(READ_ONCE(zone->active) == 0, "all contexts inactive");
+	vdo_finish_draining(&zone->state);
+}
+
+static void timeout_index_operations_callback(struct vdo_completion *completion)
+{
+	struct dedupe_context *context, *tmp;
+	struct hash_zone *zone = as_hash_zone(completion);
+	u64 timeout_jiffies = msecs_to_jiffies(vdo_dedupe_index_timeout_interval);
+	unsigned long cutoff = jiffies - timeout_jiffies;
+	unsigned int timed_out = 0;
+
+	atomic_set(&zone->timer_state, DEDUPE_QUERY_TIMER_IDLE);
+	list_for_each_entry_safe(context, tmp, &zone->pending, list_entry) {
+		if (cutoff <= context->submission_jiffies) {
+			/*
+			 * We have reached the oldest query which has not timed out yet, so restart
+			 * the timer.
+			 */
+			start_expiration_timer(context);
+			break;
+		}
+
+		if (!change_context_state(context,
+					  DEDUPE_CONTEXT_PENDING,
+					  DEDUPE_CONTEXT_TIMED_OUT))
+			/*
+			 * This context completed between the time the timeout fired and now. We
+			 * can treat it as a successful query; its requestor is already enqueued
+			 * to process it.
+			 */
+			continue;
+
+		/*
+		 * Remove this context from the pending list so we won't look at it again on a
+		 * subsequent timeout. Once the index completes it, it will be reused. Meanwhile,
+		 * send its requestor on its way.
+		 */
+		list_del_init(&context->list_entry);
+		continue_data_vio(context->requestor);
+		timed_out++;
+	}
+
+	if (timed_out > 0)
+		report_dedupe_timeouts(completion->vdo->hash_zones, timed_out);
+
+	check_for_drain_complete(zone);
+}
+
+static void timeout_index_operations(struct timer_list *t)
+{
+	struct hash_zone *zone = from_timer(zone, t, timer);
+
+	if (change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_FIRED))
+		vdo_launch_completion(&zone->completion);
+}
+
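+/**
+ * initialize_zone() - Initialize a single hash zone: its hash lock map, lock pool, dedupe
+ *                     contexts, timer, and timeout completion.
+ * @vdo: The vdo to which the zone belongs.
+ * @zones: The hash_zones containing the zone.
+ * @zone_number: The number of the zone to initialize.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */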
+static int __must_check
+initialize_zone(struct vdo *vdo, struct hash_zones *zones, zone_count_t zone_number)
+{
+	int result;
+	data_vio_count_t i;
+	struct hash_zone *zone = &zones->zones[zone_number];
+
+	result = vdo_make_pointer_map(VDO_LOCK_MAP_CAPACITY,
+				      0,
+				      compare_keys,
+				      hash_key,
+				      &zone->hash_lock_map);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	zone->zone_number = zone_number;
+	zone->thread_id = vdo->thread_config.hash_zone_threads[zone_number];
+	vdo_initialize_completion(&zone->completion, vdo, VDO_HASH_ZONE_COMPLETION);
+	vdo_set_completion_callback(&zone->completion,
+				    timeout_index_operations_callback,
+				    zone->thread_id);
+	INIT_LIST_HEAD(&zone->lock_pool);
+	result = UDS_ALLOCATE(LOCK_POOL_CAPACITY,
+			      struct hash_lock,
+			      "hash_lock array",
+			      &zone->lock_array);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (i = 0; i < LOCK_POOL_CAPACITY; i++)
+		return_hash_lock_to_pool(zone, &zone->lock_array[i]);
+
+	INIT_LIST_HEAD(&zone->available);
+	INIT_LIST_HEAD(&zone->pending);
+	result = uds_make_funnel_queue(&zone->timed_out_complete);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	timer_setup(&zone->timer, timeout_index_operations, 0);
+
+	for (i = 0; i < MAXIMUM_VDO_USER_VIOS; i++) {
+		struct dedupe_context *context = &zone->contexts[i];
+
+		context->zone = zone;
+		context->request.callback = finish_index_operation;
+		context->request.session = zones->index_session;
+		list_add(&context->list_entry, &zone->available);
+	}
+
+	return vdo_make_default_thread(vdo, zone->thread_id);
+}
+
+/** get_thread_id_for_zone() - Implements vdo_zone_thread_getter. */
+static thread_id_t get_thread_id_for_zone(void *context, zone_count_t zone_number)
+{
+	struct hash_zones *zones = context;
+
+	return zones->zones[zone_number].thread_id;
+}
+
+/**
+ * vdo_make_hash_zones() - Create the hash zones.
+ * @vdo: The vdo to which the zones will belong.
+ * @zones_ptr: A pointer to hold the zones.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr)
+{
+	int result;
+	struct hash_zones *zones;
+	zone_count_t z;
+	zone_count_t zone_count = vdo->thread_config.hash_zone_count;
+
+	if (zone_count == 0)
+		return VDO_SUCCESS;
+
+	result = UDS_ALLOCATE_EXTENDED(struct hash_zones,
+				       zone_count,
+				       struct hash_zone,
+				       __func__,
+				       &zones);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = initialize_index(vdo, zones);
+	if (result != VDO_SUCCESS) {
+		UDS_FREE(zones);
+		return result;
+	}
+
+	vdo_set_admin_state_code(&zones->state, VDO_ADMIN_STATE_NEW);
+
+	zones->zone_count = zone_count;
+	for (z = 0; z < zone_count; z++) {
+		result = initialize_zone(vdo, zones, z);
+		if (result != VDO_SUCCESS) {
+			vdo_free_hash_zones(zones);
+			return result;
+		}
+	}
+
+	result = vdo_make_action_manager(zones->zone_count,
+					 get_thread_id_for_zone,
+					 vdo->thread_config.admin_thread,
+					 zones,
+					 NULL,
+					 vdo,
+					 &zones->manager);
+	if (result != VDO_SUCCESS) {
+		vdo_free_hash_zones(zones);
+		return result;
+	}
+
+	*zones_ptr = zones;
+	return VDO_SUCCESS;
+}
+
+void vdo_finish_dedupe_index(struct hash_zones *zones)
+{
+	if (zones == NULL)
+		return;
+
+	uds_destroy_index_session(UDS_FORGET(zones->index_session));
+}
+
+/**
+ * vdo_free_hash_zones() - Free the hash zones.
+ * @zones: The zones to free.
+ */
+void vdo_free_hash_zones(struct hash_zones *zones)
+{
+	zone_count_t i;
+
+	if (zones == NULL)
+		return;
+
+	UDS_FREE(UDS_FORGET(zones->manager));
+
+	for (i = 0; i < zones->zone_count; i++) {
+		struct hash_zone *zone = &zones->zones[i];
+
+		uds_free_funnel_queue(UDS_FORGET(zone->timed_out_complete));
+		vdo_free_pointer_map(UDS_FORGET(zone->hash_lock_map));
+		UDS_FREE(UDS_FORGET(zone->lock_array));
+	}
+
+	if (zones->index_session != NULL)
+		vdo_finish_dedupe_index(zones);
+
+	ratelimit_state_exit(&zones->ratelimiter);
+	if (vdo_get_admin_state_code(&zones->state) == VDO_ADMIN_STATE_NEW)
+		UDS_FREE(zones);
+	else
+		kobject_put(&zones->dedupe_directory);
+}
+
+static void initiate_suspend_index(struct admin_state *state)
+{
+	struct hash_zones *zones = container_of(state, struct hash_zones, state);
+	enum index_state index_state;
+
+	spin_lock(&zones->lock);
+	index_state = zones->index_state;
+	spin_unlock(&zones->lock);
+
+	if (index_state != IS_CLOSED) {
+		bool save = vdo_is_state_saving(&zones->state);
+		int result;
+
+		result = uds_suspend_index_session(zones->index_session, save);
+		if (result != UDS_SUCCESS)
+			uds_log_error_strerror(result, "Error suspending dedupe index");
+	}
+
+	vdo_finish_draining(state);
+}
+
+/**
+ * suspend_index() - Suspend the UDS index prior to draining hash zones.
+ *
+ * Implements vdo_action_preamble
+ */
+static void suspend_index(void *context, struct vdo_completion *completion)
+{
+	struct hash_zones *zones = context;
+
+	vdo_start_draining(&zones->state,
+			   vdo_get_current_manager_operation(zones->manager),
+			   completion,
+			   initiate_suspend_index);
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ *
+ * Implements vdo_admin_initiator.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	check_for_drain_complete(container_of(state, struct hash_zone, state));
+}
+
+/**
+ * drain_hash_zone() - Drain a hash zone.
+ *
+ * Implements vdo_zone_action.
+ */
+static void drain_hash_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent)
+{
+	struct hash_zones *zones = context;
+
+	vdo_start_draining(&zones->zones[zone_number].state,
+			   vdo_get_current_manager_operation(zones->manager),
+			   parent,
+			   initiate_drain);
+}
+
+/** vdo_drain_hash_zones() - Drain all hash zones. */
+void vdo_drain_hash_zones(struct hash_zones *zones, struct vdo_completion *parent)
+{
+	vdo_schedule_operation(zones->manager,
+			       parent->vdo->suspend_type,
+			       suspend_index,
+			       drain_hash_zone,
+			       NULL,
+			       parent);
+}
+
+static void launch_dedupe_state_change(struct hash_zones *zones)
+{
+	/* ASSERTION: We enter with the lock held. */
+	if (zones->changing || !vdo_is_state_normal(&zones->state))
+		/* Either a change is already in progress, or changes are not allowed. */
+		return;
+
+	if (zones->create_flag || (zones->index_state != zones->index_target)) {
+		zones->changing = true;
+		vdo_launch_completion(&zones->completion);
+		return;
+	}
+
+	/* ASSERTION: We exit with the lock held. */
+}
+
+/**
+ * resume_index() - Resume the UDS index prior to resuming hash zones.
+ *
+ * Implements vdo_action_preamble
+ */
+static void resume_index(void *context, struct vdo_completion *parent)
+{
+	struct hash_zones *zones = context;
+	struct device_config *config = parent->vdo->device_config;
+	int result;
+
+	zones->parameters.name = config->parent_device_name;
+	result = uds_resume_index_session(zones->index_session, zones->parameters.name);
+	if (result != UDS_SUCCESS)
+		uds_log_error_strerror(result, "Error resuming dedupe index");
+
+	spin_lock(&zones->lock);
+	vdo_resume_if_quiescent(&zones->state);
+
+	if (config->deduplication) {
+		zones->index_target = IS_OPENED;
+		WRITE_ONCE(zones->dedupe_flag, true);
+	} else {
+		zones->index_target = IS_CLOSED;
+	}
+
+	launch_dedupe_state_change(zones);
+	spin_unlock(&zones->lock);
+
+	vdo_finish_completion(parent);
+}
+
+/**
+ * resume_hash_zone() - Resume a hash zone.
+ *
+ * Implements vdo_zone_action.
+ */
+static void
+resume_hash_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent)
+{
+	struct hash_zone *zone = &(((struct hash_zones *) context)->zones[zone_number]);
+
+	vdo_fail_completion(parent, vdo_resume_if_quiescent(&zone->state));
+}
+
+/**
+ * vdo_resume_hash_zones() - Resume a set of hash zones.
+ * @zones: The hash zones to resume.
+ * @parent: The object to notify when the zones have resumed.
+ */
+void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *parent)
+{
+	if (vdo_is_read_only(parent->vdo)) {
+		vdo_launch_completion(parent);
+		return;
+	}
+
+	vdo_schedule_operation(zones->manager,
+			       VDO_ADMIN_STATE_RESUMING,
+			       resume_index,
+			       resume_hash_zone,
+			       NULL,
+			       parent);
+}
+
+/**
+ * get_hash_zone_statistics() - Add the statistics for this hash zone to the tally for all zones.
+ * @zone: The hash zone to query.
+ * @tally: The tally to which to add this zone's statistics.
+ */
+static void
+get_hash_zone_statistics(const struct hash_zone *zone, struct hash_lock_statistics *tally)
+{
+	const struct hash_lock_statistics *stats = &zone->statistics;
+
+	tally->dedupe_advice_valid += READ_ONCE(stats->dedupe_advice_valid);
+	tally->dedupe_advice_stale += READ_ONCE(stats->dedupe_advice_stale);
+	tally->concurrent_data_matches += READ_ONCE(stats->concurrent_data_matches);
+	tally->concurrent_hash_collisions += READ_ONCE(stats->concurrent_hash_collisions);
+	tally->curr_dedupe_queries += READ_ONCE(zone->active);
+}
+
+static void get_index_statistics(struct hash_zones *zones, struct index_statistics *stats)
+{
+	enum index_state state;
+	struct uds_index_stats index_stats;
+	int result;
+
+	spin_lock(&zones->lock);
+	state = zones->index_state;
+	spin_unlock(&zones->lock);
+
+	if (state != IS_OPENED)
+		return;
+
+	result = uds_get_index_session_stats(zones->index_session, &index_stats);
+	if (result != UDS_SUCCESS) {
+		uds_log_error_strerror(result, "Error reading index stats");
+		return;
+	}
+
+	stats->entries_indexed = index_stats.entries_indexed;
+	stats->posts_found = index_stats.posts_found;
+	stats->posts_not_found = index_stats.posts_not_found;
+	stats->queries_found = index_stats.queries_found;
+	stats->queries_not_found = index_stats.queries_not_found;
+	stats->updates_found = index_stats.updates_found;
+	stats->updates_not_found = index_stats.updates_not_found;
+	stats->entries_discarded = index_stats.entries_discarded;
+}
+
+/**
+ * vdo_get_dedupe_statistics() - Tally the statistics from all the hash zones and the UDS index.
+ * @zones: The hash zones to query.
+ * @stats: The statistics structure to update with the sum of the hash lock statistics from all
+ *         hash zones plus the statistics from the UDS index.
+ */
+void vdo_get_dedupe_statistics(struct hash_zones *zones, struct vdo_statistics *stats)
+{
+	zone_count_t zone;
+
+	for (zone = 0; zone < zones->zone_count; zone++)
+		get_hash_zone_statistics(&zones->zones[zone], &stats->hash_lock);
+
+	get_index_statistics(zones, &stats->index);
+
+	/*
+	 * zones->timeouts gives the number of timeouts, and dedupe_context_busy gives the number
+	 * of queries not made because of earlier timeouts.
+	 */
+	stats->dedupe_advice_timeouts =
+		(atomic64_read(&zones->timeouts) + atomic64_read(&zones->dedupe_context_busy));
+}
+
+/**
+ * vdo_select_hash_zone() - Select the hash zone responsible for locking a given record name.
+ * @zones: The hash_zones from which to select.
+ * @name: The record name.
+ *
+ * Return: The hash zone responsible for the record name.
+ */
+struct hash_zone *
+vdo_select_hash_zone(struct hash_zones *zones, const struct uds_record_name *name)
+{
+	/*
+	 * Use a fragment of the record name as a hash code. Eight bits of hash should suffice
+	 * since the number of hash zones is small.
+	 * TODO: Verify that the first byte is independent enough.
+	 */
+	u32 hash = name->name[0];
+
+	/*
+	 * Scale the 8-bit hash fragment to a zone index by treating it as a binary fraction and
+	 * multiplying that by the zone count. If the hash is uniformly distributed over [0 ..
+	 * 2^8-1], then (hash * count / 2^8) should be uniformly distributed over [0 .. count-1].
+	 * The multiply and shift is much faster than a divide (modulus) on X86 CPUs.
+	 */
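+	/*
+	 * For example (purely illustrative): with 4 zones, a hash fragment of 200 selects
+	 * (200 * 4) >> 8 = 3, the last zone, while a fragment of 63 selects zone 0.
+	 */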
+	hash = (hash * zones->zone_count) >> 8;
+	return &zones->zones[hash];
+}
+
+/**
+ * dump_hash_lock() - Dump a compact description of hash_lock to the log if the lock is not on the
+ *                    free list.
+ * @lock: The hash lock to dump.
+ */
+static void dump_hash_lock(const struct hash_lock *lock)
+{
+	const char *state;
+
+	if (!list_empty(&lock->pool_node))
+		/* This lock is on the free list. */
+		return;
+
+	/*
+	 * Necessarily cryptic since we can log a lot of these. The first three characters of the
+	 * state name are unambiguous. 'U' indicates a lock not registered in the map.
+	 */
+	state = get_hash_lock_state_name(lock->state);
+	uds_log_info("  hl %px: %3.3s %c%llu/%u rc=%u wc=%zu agt=%px",
+		     (const void *) lock, state, (lock->registered ? 'D' : 'U'),
+		     (unsigned long long) lock->duplicate.pbn,
+		     lock->duplicate.state, lock->reference_count,
+		     vdo_count_waiters(&lock->waiters), (void *) lock->agent);
+}
+
+static const char *index_state_to_string(struct hash_zones *zones, enum index_state state)
+{
+	if (!vdo_is_state_normal(&zones->state))
+		return SUSPENDED;
+
+	switch (state) {
+	case IS_CLOSED:
+		return zones->error_flag ? ERROR : CLOSED;
+	case IS_CHANGING:
+		return zones->index_target == IS_OPENED ? OPENING : CLOSING;
+	case IS_OPENED:
+		return READ_ONCE(zones->dedupe_flag) ? ONLINE : OFFLINE;
+	default:
+		return UNKNOWN;
+	}
+}
+
+/**
+ * dump_hash_zone() - Dump information about a hash zone to the log for debugging.
+ * @zone: The zone to dump.
+ */
+static void dump_hash_zone(const struct hash_zone *zone)
+{
+	data_vio_count_t i;
+
+	if (zone->hash_lock_map == NULL) {
+		uds_log_info("struct hash_zone %u: NULL map", zone->zone_number);
+		return;
+	}
+
+	uds_log_info("struct hash_zone %u: mapSize=%zu",
+		     zone->zone_number,
+		     vdo_pointer_map_size(zone->hash_lock_map));
+	for (i = 0; i < LOCK_POOL_CAPACITY; i++)
+		dump_hash_lock(&zone->lock_array[i]);
+}
+
+/**
+ * vdo_dump_hash_zones() - Dump information about the hash zones to the log for debugging.
+ * @zones: The zones to dump.
+ */
+void vdo_dump_hash_zones(struct hash_zones *zones)
+{
+	const char *state, *target;
+	zone_count_t zone;
+
+	spin_lock(&zones->lock);
+	state = index_state_to_string(zones, zones->index_state);
+	target = (zones->changing ? index_state_to_string(zones, zones->index_target) : NULL);
+	spin_unlock(&zones->lock);
+
+	uds_log_info("UDS index: state: %s", state);
+	if (target != NULL)
+		uds_log_info("UDS index: changing to state: %s", target);
+
+	for (zone = 0; zone < zones->zone_count; zone++)
+		dump_hash_zone(&zones->zones[zone]);
+}
+
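+/*
+ * Set the dedupe timeout interval, clamping the supplied value to the range from 2 jiffies up to
+ * two minutes, and store the result in both millisecond and jiffy form.
+ */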
+void vdo_set_dedupe_index_timeout_interval(unsigned int value)
+{
+	u64 alb_jiffies;
+
+	/* Arbitrary maximum value is two minutes */
+	if (value > 120000)
+		value = 120000;
+	/* Arbitrary minimum value is 2 jiffies */
+	alb_jiffies = msecs_to_jiffies(value);
+
+	if (alb_jiffies < 2) {
+		alb_jiffies = 2;
+		value = jiffies_to_msecs(alb_jiffies);
+	}
+	vdo_dedupe_index_timeout_interval = value;
+	vdo_dedupe_index_timeout_jiffies = alb_jiffies;
+}
+
+void vdo_set_dedupe_index_min_timer_interval(unsigned int value)
+{
+	u64 min_jiffies;
+
+	/* Arbitrary maximum value is one second */
+	if (value > 1000)
+		value = 1000;
+
+	/* Arbitrary minimum value is 2 jiffies */
+	min_jiffies = msecs_to_jiffies(value);
+
+	if (min_jiffies < 2) {
+		min_jiffies = 2;
+		value = jiffies_to_msecs(min_jiffies);
+	}
+
+	vdo_dedupe_index_min_timer_interval = value;
+	vdo_dedupe_index_min_timer_jiffies = min_jiffies;
+}
+
+/**
+ * acquire_context() - Acquire a dedupe context from a hash_zone if any are available.
+ * @zone: the hash zone
+ *
+ * Return: A dedupe_context or NULL if none are available
+ */
+static struct dedupe_context * __must_check acquire_context(struct hash_zone *zone)
+{
+	struct dedupe_context *context;
+	struct funnel_queue_entry *entry;
+
+	assert_in_hash_zone(zone, __func__);
+
+	if (!list_empty(&zone->available)) {
+		WRITE_ONCE(zone->active, zone->active + 1);
+		context = list_first_entry(&zone->available, struct dedupe_context, list_entry);
+		list_del_init(&context->list_entry);
+		return context;
+	}
+
+	entry = uds_funnel_queue_poll(zone->timed_out_complete);
+	return ((entry == NULL) ? NULL : container_of(entry, struct dedupe_context, queue_entry));
+}
+
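+/*
+ * Fill out a uds_request for the given data_vio. For a UDS_POST or UDS_UPDATE, the new_mapped
+ * fields are encoded as dedupe advice: one advice version byte, one mapping state byte, and the
+ * physical block number as a little-endian u64, which together must total UDS_ADVICE_SIZE bytes.
+ */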
+static void prepare_uds_request(struct uds_request *request,
+				struct data_vio *data_vio,
+				enum uds_request_type operation)
+{
+	request->record_name = data_vio->record_name;
+	request->type = operation;
+	if ((operation == UDS_POST) || (operation == UDS_UPDATE)) {
+		size_t offset = 0;
+		struct uds_record_data *encoding = &request->new_metadata;
+
+		encoding->data[offset++] = UDS_ADVICE_VERSION;
+		encoding->data[offset++] = data_vio->new_mapped.state;
+		put_unaligned_le64(data_vio->new_mapped.pbn, &encoding->data[offset]);
+		offset += sizeof(u64);
+		BUG_ON(offset != UDS_ADVICE_SIZE);
+	}
+}
+
+/*
+ * The index operation will inquire about data_vio.record_name, providing (if the operation is
+ * appropriate) advice from the data_vio's new_mapped fields. The advice found in the index (or
+ * NULL if none) will be returned via receive_data_vio_dedupe_advice(). dedupe_context.status is
+ * set to the return status code of any asynchronous index processing.
+ */
+static void query_index(struct data_vio *data_vio, enum uds_request_type operation)
+{
+	int result;
+	struct dedupe_context *context;
+	struct vdo *vdo = vdo_from_data_vio(data_vio);
+	struct hash_zone *zone = data_vio->hash_zone;
+
+	assert_data_vio_in_hash_zone(data_vio);
+
+	if (!READ_ONCE(vdo->hash_zones->dedupe_flag)) {
+		continue_data_vio(data_vio);
+		return;
+	}
+
+	context = acquire_context(zone);
+	if (context == NULL) {
+		atomic64_inc(&vdo->hash_zones->dedupe_context_busy);
+		continue_data_vio(data_vio);
+		return;
+	}
+
+	data_vio->dedupe_context = context;
+	context->requestor = data_vio;
+	context->submission_jiffies = jiffies;
+	prepare_uds_request(&context->request, data_vio, operation);
+	atomic_set(&context->state, DEDUPE_CONTEXT_PENDING);
+	list_add_tail(&context->list_entry, &zone->pending);
+	start_expiration_timer(context);
+	result = uds_launch_request(&context->request);
+	if (result != UDS_SUCCESS) {
+		context->request.status = result;
+		finish_index_operation(&context->request);
+	}
+}
+
+static void set_target_state(struct hash_zones *zones,
+			     enum index_state target,
+			     bool change_dedupe,
+			     bool dedupe,
+			     bool set_create)
+{
+	const char *old_state, *new_state;
+
+	spin_lock(&zones->lock);
+	old_state = index_state_to_string(zones, zones->index_target);
+	if (change_dedupe)
+		WRITE_ONCE(zones->dedupe_flag, dedupe);
+
+	if (set_create)
+		zones->create_flag = true;
+
+	zones->index_target = target;
+	launch_dedupe_state_change(zones);
+	new_state = index_state_to_string(zones, zones->index_target);
+	spin_unlock(&zones->lock);
+
+	if (old_state != new_state)
+		uds_log_info("Setting UDS index target state to %s", new_state);
+}
+
+const char *vdo_get_dedupe_index_state_name(struct hash_zones *zones)
+{
+	const char *state;
+
+	spin_lock(&zones->lock);
+	state = index_state_to_string(zones, zones->index_state);
+	spin_unlock(&zones->lock);
+
+	return state;
+}
+
+/* Handle a dmsetup message relevant to the index. */
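+/*
+ * For example (illustrative only), deduplication can typically be toggled at runtime with a
+ * message such as "dmsetup message <vdo-device> 0 index-enable"; the routing of target messages
+ * to this function lives elsewhere in the target code.
+ */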
+int vdo_message_dedupe_index(struct hash_zones *zones, const char *name)
+{
+	if (strcasecmp(name, "index-close") == 0) {
+		set_target_state(zones, IS_CLOSED, false, false, false);
+		return 0;
+	} else if (strcasecmp(name, "index-create") == 0) {
+		set_target_state(zones, IS_OPENED, false, false, true);
+		return 0;
+	} else if (strcasecmp(name, "index-disable") == 0) {
+		set_target_state(zones, IS_OPENED, true, false, false);
+		return 0;
+	} else if (strcasecmp(name, "index-enable") == 0) {
+		set_target_state(zones, IS_OPENED, true, true, false);
+		return 0;
+	}
+	return -EINVAL;
+}
+
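+/*
+ * Register the "dedupe" kobject under the vdo device's sysfs directory, exposing the read-only
+ * "status" attribute defined above, which reports the current UDS index state.
+ */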
+int vdo_add_dedupe_index_sysfs(struct hash_zones *zones)
+{
+	int result = kobject_add(&zones->dedupe_directory,
+				 &zones->completion.vdo->vdo_directory,
+				 "dedupe");
+
+	if (result == 0)
+		vdo_set_admin_state_code(&zones->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+
+	return result;
+}
+
+/* If create_flag, create a new index without first attempting to load an existing index. */
+void vdo_start_dedupe_index(struct hash_zones *zones, bool create_flag)
+{
+	set_target_state(zones, IS_OPENED, true, true, create_flag);
+}
diff --git a/drivers/md/dm-vdo/dedupe.h b/drivers/md/dm-vdo/dedupe.h
new file mode 100644
index 00000000000..d5730619694
--- /dev/null
+++ b/drivers/md/dm-vdo/dedupe.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_DEDUPE_H
+#define VDO_DEDUPE_H
+
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/timer.h>
+
+#include "uds.h"
+
+#include "admin-state.h"
+#include "constants.h"
+#include "statistics.h"
+#include "types.h"
+#include "wait-queue.h"
+
+struct dedupe_context {
+	struct hash_zone *zone;
+	struct uds_request request;
+	struct list_head list_entry;
+	struct funnel_queue_entry queue_entry;
+	u64 submission_jiffies;
+	struct data_vio *requestor;
+	atomic_t state;
+};
+
+struct hash_lock;
+
+struct hash_zone {
+	/* Which hash zone this is */
+	zone_count_t zone_number;
+
+	/* The administrative state of the zone */
+	struct admin_state state;
+
+	/* The thread ID for this zone */
+	thread_id_t thread_id;
+
+	/* Mapping from record name fields to hash_locks */
+	struct pointer_map *hash_lock_map;
+
+	/* List containing all unused hash_locks */
+	struct list_head lock_pool;
+
+	/*
+	 * Statistics shared by all hash locks in this zone. Only modified on the hash zone thread,
+	 * but queried by other threads.
+	 */
+	struct hash_lock_statistics statistics;
+
+	/* Array of all hash_locks */
+	struct hash_lock *lock_array;
+
+	/* These fields are used to manage the dedupe contexts */
+	struct list_head available;
+	struct list_head pending;
+	struct funnel_queue *timed_out_complete;
+	struct timer_list timer;
+	struct vdo_completion completion;
+	unsigned int active;
+	atomic_t timer_state;
+
+	/* The dedupe contexts for querying the index from this zone */
+	struct dedupe_context contexts[MAXIMUM_VDO_USER_VIOS];
+};
+
+struct hash_zones;
+
+struct pbn_lock * __must_check vdo_get_duplicate_lock(struct data_vio *data_vio);
+
+void vdo_acquire_hash_lock(struct vdo_completion *completion);
+void vdo_continue_hash_lock(struct vdo_completion *completion);
+void vdo_release_hash_lock(struct data_vio *data_vio);
+void vdo_clean_failed_hash_lock(struct data_vio *data_vio);
+void vdo_share_compressed_write_lock(struct data_vio *data_vio, struct pbn_lock *pbn_lock);
+
+int __must_check vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr);
+
+void vdo_free_hash_zones(struct hash_zones *zones);
+
+void vdo_drain_hash_zones(struct hash_zones *zones, struct vdo_completion *parent);
+
+void vdo_get_dedupe_statistics(struct hash_zones *zones, struct vdo_statistics *stats);
+
+struct hash_zone * __must_check
+vdo_select_hash_zone(struct hash_zones *zones, const struct uds_record_name *name);
+
+void vdo_dump_hash_zones(struct hash_zones *zones);
+
+const char *vdo_get_dedupe_index_state_name(struct hash_zones *zones);
+
+u64 vdo_get_dedupe_index_timeout_count(struct hash_zones *zones);
+
+int vdo_message_dedupe_index(struct hash_zones *zones, const char *name);
+
+int vdo_add_dedupe_index_sysfs(struct hash_zones *zones);
+
+void vdo_start_dedupe_index(struct hash_zones *zones, bool create_flag);
+
+void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *parent);
+
+void vdo_finish_dedupe_index(struct hash_zones *zones);
+
+/* Interval (in milliseconds) from submission until switching to fast path and skipping UDS. */
+extern unsigned int vdo_dedupe_index_timeout_interval;
+
+/*
+ * Minimum time interval (in milliseconds) between timer invocations to check for requests waiting
+ * for UDS that should now time out.
+ */
+extern unsigned int vdo_dedupe_index_min_timer_interval;
+
+void vdo_set_dedupe_index_timeout_interval(unsigned int value);
+void vdo_set_dedupe_index_min_timer_interval(unsigned int value);
+
+#endif /* VDO_DEDUPE_H */
diff --git a/drivers/md/dm-vdo/dump.c b/drivers/md/dm-vdo/dump.c
new file mode 100644
index 00000000000..0943357f98b
--- /dev/null
+++ b/drivers/md/dm-vdo/dump.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "dump.h"
+
+#include <linux/module.h>
+
+#include "memory-alloc.h"
+#include "string-utils.h"
+
+#include "constants.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "io-submitter.h"
+#include "logger.h"
+#include "types.h"
+#include "vdo.h"
+#include "work-queue.h"
+
+enum dump_options {
+	/* Work queues */
+	SHOW_QUEUES,
+	/* Memory pools */
+	SHOW_VIO_POOL,
+	/* Others */
+	SHOW_VDO_STATUS,
+	/* This one means an option overrides the "default" choices, instead of altering them. */
+	SKIP_DEFAULT
+};
+
+enum dump_option_flags {
+	/* Work queues */
+	FLAG_SHOW_QUEUES = (1 << SHOW_QUEUES),
+	/* Memory pools */
+	FLAG_SHOW_VIO_POOL = (1 << SHOW_VIO_POOL),
+	/* Others */
+	FLAG_SHOW_VDO_STATUS = (1 << SHOW_VDO_STATUS),
+	/* Special */
+	FLAG_SKIP_DEFAULT = (1 << SKIP_DEFAULT)
+};
+
+enum {
+	FLAGS_ALL_POOLS = (FLAG_SHOW_VIO_POOL),
+	DEFAULT_DUMP_FLAGS = (FLAG_SHOW_QUEUES | FLAG_SHOW_VDO_STATUS)
+};
+
+static inline bool is_arg_string(const char *arg, const char *this_option)
+{
+	/* convention seems to be case-independent options */
+	return strncasecmp(arg, this_option, strlen(this_option)) == 0;
+}
+
+static void do_dump(struct vdo *vdo, unsigned int dump_options_requested, const char *why)
+{
+	u32 active, maximum;
+	s64 outstanding;
+
+	uds_log_info("%s dump triggered via %s", UDS_LOGGING_MODULE_NAME, why);
+	active = get_data_vio_pool_active_requests(vdo->data_vio_pool);
+	maximum = get_data_vio_pool_maximum_requests(vdo->data_vio_pool);
+	outstanding = (atomic64_read(&vdo->stats.bios_submitted) -
+		       atomic64_read(&vdo->stats.bios_completed));
+	uds_log_info("%u device requests outstanding (max %u), %lld bio requests outstanding, device '%s'",
+		     active,
+		     maximum,
+		     outstanding,
+		     vdo_get_device_name(vdo->device_config->owning_target));
+	if (((dump_options_requested & FLAG_SHOW_QUEUES) != 0) && (vdo->threads != NULL)) {
+		thread_id_t id;
+
+		for (id = 0; id < vdo->thread_config.thread_count; id++)
+			vdo_dump_work_queue(vdo->threads[id].queue);
+	}
+
+	vdo_dump_hash_zones(vdo->hash_zones);
+	dump_data_vio_pool(vdo->data_vio_pool, (dump_options_requested & FLAG_SHOW_VIO_POOL) != 0);
+	if ((dump_options_requested & FLAG_SHOW_VDO_STATUS) != 0)
+		vdo_dump_status(vdo);
+
+	uds_report_memory_usage();
+	uds_log_info("end of %s dump", UDS_LOGGING_MODULE_NAME);
+}
+
+static int
+parse_dump_options(unsigned int argc, char *const *argv, unsigned int *dump_options_requested_ptr)
+{
+	unsigned int dump_options_requested = 0;
+
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} option_names[] = {
+		{ "viopool", FLAG_SKIP_DEFAULT | FLAG_SHOW_VIO_POOL },
+		{ "vdo", FLAG_SKIP_DEFAULT | FLAG_SHOW_VDO_STATUS },
+		{ "pools", FLAG_SKIP_DEFAULT | FLAGS_ALL_POOLS },
+		{ "queues", FLAG_SKIP_DEFAULT | FLAG_SHOW_QUEUES },
+		{ "threads", FLAG_SKIP_DEFAULT | FLAG_SHOW_QUEUES },
+		{ "default", FLAG_SKIP_DEFAULT | DEFAULT_DUMP_FLAGS },
+		{ "all", ~0 },
+	};
+
+	bool options_okay = true;
+	unsigned int i;
+
+	for (i = 1; i < argc; i++) {
+		unsigned int j;
+
+		for (j = 0; j < ARRAY_SIZE(option_names); j++) {
+			if (is_arg_string(argv[i], option_names[j].name)) {
+				dump_options_requested |= option_names[j].flags;
+				break;
+			}
+		}
+		if (j == ARRAY_SIZE(option_names)) {
+			uds_log_warning("dump option name '%s' unknown", argv[i]);
+			options_okay = false;
+		}
+	}
+	if (!options_okay)
+		return -EINVAL;
+	if ((dump_options_requested & FLAG_SKIP_DEFAULT) == 0)
+		dump_options_requested |= DEFAULT_DUMP_FLAGS;
+	*dump_options_requested_ptr = dump_options_requested;
+	return 0;
+}
+
+/* Dump as specified by zero or more string arguments. */
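+/*
+ * The arguments typically come from a target message (for example, something like
+ * "dmsetup message <vdo-device> 0 dump queues vdo"); any unrecognized option name causes the
+ * whole dump to be rejected with -EINVAL.
+ */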
+int vdo_dump(struct vdo *vdo, unsigned int argc, char *const *argv, const char *why)
+{
+	unsigned int dump_options_requested = 0;
+	int result = parse_dump_options(argc, argv, &dump_options_requested);
+
+	if (result != 0)
+		return result;
+
+	do_dump(vdo, dump_options_requested, why);
+	return 0;
+}
+
+/* Dump everything we know how to dump */
+void vdo_dump_all(struct vdo *vdo, const char *why)
+{
+	do_dump(vdo, ~0, why);
+}
+
+/*
+ * Dump out the data_vio waiters on a wait queue.
+ * wait_on should be the label to print for queue (e.g. logical or physical)
+ */
+static void dump_vio_waiters(struct wait_queue *queue, char *wait_on)
+{
+	struct waiter *waiter, *first = vdo_get_first_waiter(queue);
+	struct data_vio *data_vio;
+
+	if (first == NULL)
+		return;
+
+	data_vio = waiter_as_data_vio(first);
+
+	uds_log_info("      %s is locked. Waited on by: vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s",
+		     wait_on,
+		     data_vio,
+		     data_vio->allocation.pbn,
+		     data_vio->logical.lbn,
+		     data_vio->duplicate.pbn,
+		     get_data_vio_operation_name(data_vio));
+
+	for (waiter = first->next_waiter; waiter != first; waiter = waiter->next_waiter) {
+		data_vio = waiter_as_data_vio(waiter);
+		uds_log_info("     ... and : vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s",
+			     data_vio,
+			     data_vio->allocation.pbn,
+			     data_vio->logical.lbn,
+			     data_vio->duplicate.pbn,
+			     get_data_vio_operation_name(data_vio));
+	}
+}
+
+/*
+ * Encode various attributes of a data_vio as a string of one-character flags. This encoding is for
+ * logging brevity:
+ *
+ * R => vio completion result not VDO_SUCCESS
+ * W => vio is on a wait queue
+ * D => vio is a duplicate
+ * p => vio is a partial block operation
+ * z => vio is a zero block
+ * d => vio is a discard
+ *
+ * The common case of no flags set will result in an empty, null-terminated buffer. If any flags
+ * are encoded, the first character in the string will be a space character.
+ */
+static void encode_vio_dump_flags(struct data_vio *data_vio, char buffer[8])
+{
+	char *p_flag = buffer;
+	*p_flag++ = ' ';
+	if (data_vio->vio.completion.result != VDO_SUCCESS)
+		*p_flag++ = 'R';
+	if (data_vio->waiter.next_waiter != NULL)
+		*p_flag++ = 'W';
+	if (data_vio->is_duplicate)
+		*p_flag++ = 'D';
+	if (data_vio->is_partial)
+		*p_flag++ = 'p';
+	if (data_vio->is_zero)
+		*p_flag++ = 'z';
+	if (data_vio->remaining_discard > 0)
+		*p_flag++ = 'd';
+	if (p_flag == &buffer[1])
+		/* No flags, so remove the blank space. */
+		p_flag = buffer;
+	*p_flag = '\0';
+}
+
+/* Implements buffer_dump_function. */
+void dump_data_vio(void *data)
+{
+	struct data_vio *data_vio = (struct data_vio *) data;
+
+	/*
+	 * This just needs to be big enough to hold a queue (thread) name and a function name (plus
+	 * a separator character and NUL). The latter is limited only by taste.
+	 *
+	 * In making this static, we're assuming only one "dump" will run at a time. If more than
+	 * one does run, the log output will be garbled anyway.
+	 */
+	static char vio_completion_dump_buffer[100 + MAX_VDO_WORK_QUEUE_NAME_LEN];
+	/* Another static buffer... log10(256) = 2.408+, round up: */
+	enum { DIGITS_PER_U64 = 1 + sizeof(u64) * 2409 / 1000 };
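+	/* For a u64 this works out to 1 + 8 * 2409 / 1000 = 20, enough for the 20 digits of U64_MAX. */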
+
+	static char vio_block_number_dump_buffer[sizeof("P L D") + 3 * DIGITS_PER_U64];
+	static char vio_flush_generation_buffer[sizeof(" FG") + DIGITS_PER_U64];
+	static char flags_dump_buffer[8];
+
+	/*
+	 * We're likely to be logging a couple thousand of these lines, and in some circumstances
+	 * syslogd may have trouble keeping up, so keep it BRIEF rather than user-friendly.
+	 */
+	vdo_dump_completion_to_buffer(&data_vio->vio.completion,
+				      vio_completion_dump_buffer,
+				      sizeof(vio_completion_dump_buffer));
+	if (data_vio->is_duplicate)
+		snprintf(vio_block_number_dump_buffer,
+			 sizeof(vio_block_number_dump_buffer),
+			 "P%llu L%llu D%llu",
+			 data_vio->allocation.pbn,
+			 data_vio->logical.lbn,
+			 data_vio->duplicate.pbn);
+	else if (data_vio_has_allocation(data_vio))
+		snprintf(vio_block_number_dump_buffer,
+			 sizeof(vio_block_number_dump_buffer),
+			 "P%llu L%llu",
+			 data_vio->allocation.pbn,
+			 data_vio->logical.lbn);
+	else
+		snprintf(vio_block_number_dump_buffer,
+			 sizeof(vio_block_number_dump_buffer),
+			 "L%llu",
+			 data_vio->logical.lbn);
+
+	if (data_vio->flush_generation != 0)
+		snprintf(vio_flush_generation_buffer,
+			 sizeof(vio_flush_generation_buffer),
+			 " FG%llu",
+			 data_vio->flush_generation);
+	else
+		vio_flush_generation_buffer[0] = 0;
+
+	encode_vio_dump_flags(data_vio, flags_dump_buffer);
+
+	uds_log_info("	vio %px %s%s %s %s%s",
+		     data_vio,
+		     vio_block_number_dump_buffer,
+		     vio_flush_generation_buffer,
+		     get_data_vio_operation_name(data_vio),
+		     vio_completion_dump_buffer,
+		     flags_dump_buffer);
+	/*
+	 * might want info on: wantUDSAnswer / operation / status
+	 * might want info on: bio / bios_merged
+	 */
+
+	dump_vio_waiters(&data_vio->logical.waiters, "lbn");
+
+	/* might want to dump more info from vio here */
+}
diff --git a/drivers/md/dm-vdo/dump.h b/drivers/md/dm-vdo/dump.h
new file mode 100644
index 00000000000..74ade5eab07
--- /dev/null
+++ b/drivers/md/dm-vdo/dump.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_DUMP_H
+#define VDO_DUMP_H
+
+#include "types.h"
+
+int vdo_dump(struct vdo *vdo, unsigned int argc, char *const *argv, const char *why);
+
+void vdo_dump_all(struct vdo *vdo, const char *why);
+
+void dump_data_vio(void *data);
+
+#endif /* VDO_DUMP_H */
diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c
new file mode 100644
index 00000000000..d4425686092
--- /dev/null
+++ b/drivers/md/dm-vdo/encodings.c
@@ -0,0 +1,1523 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "encodings.h"
+
+#include <linux/log2.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "constants.h"
+#include "release-versions.h"
+#include "status-codes.h"
+#include "types.h"
+
+struct geometry_block {
+	char magic_number[VDO_GEOMETRY_MAGIC_NUMBER_SIZE];
+	struct packed_header header;
+	u32 checksum;
+} __packed;
+
+static const struct header GEOMETRY_BLOCK_HEADER_5_0 = {
+	.id = VDO_GEOMETRY_BLOCK,
+	.version = {
+		.major_version = 5,
+		.minor_version = 0,
+	},
+	/*
+	 * Note: this size isn't just the payload size following the header, like it is everywhere
+	 * else in VDO.
+	 */
+	.size = sizeof(struct geometry_block) + sizeof(struct volume_geometry),
+};
+
+static const struct header GEOMETRY_BLOCK_HEADER_4_0 = {
+	.id = VDO_GEOMETRY_BLOCK,
+	.version = {
+		.major_version = 4,
+		.minor_version = 0,
+	},
+	/*
+	 * Note: this size isn't just the payload size following the header, like it is everywhere
+	 * else in VDO.
+	 */
+	.size = sizeof(struct geometry_block) + sizeof(struct volume_geometry_4_0),
+};
+
+const u8 VDO_GEOMETRY_MAGIC_NUMBER[VDO_GEOMETRY_MAGIC_NUMBER_SIZE + 1] = "dmvdo001";
+
+static const release_version_number_t COMPATIBLE_RELEASE_VERSIONS[] = {
+	VDO_MAGNESIUM_RELEASE_VERSION_NUMBER,
+	VDO_ALUMINUM_RELEASE_VERSION_NUMBER,
+};
+
+enum {
+	PAGE_HEADER_4_1_SIZE = 8 + 8 + 8 + 1 + 1 + 1 + 1,
+};
+
+static const struct version_number BLOCK_MAP_4_1 = {
+	.major_version = 4,
+	.minor_version = 1,
+};
+
+const struct header VDO_BLOCK_MAP_HEADER_2_0 = {
+	.id = VDO_BLOCK_MAP,
+	.version = {
+		.major_version = 2,
+		.minor_version = 0,
+	},
+	.size = sizeof(struct block_map_state_2_0),
+};
+
+const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0 = {
+	.id = VDO_RECOVERY_JOURNAL,
+	.version = {
+			.major_version = 7,
+			.minor_version = 0,
+		},
+	.size = sizeof(struct recovery_journal_state_7_0),
+};
+
+const struct header VDO_SLAB_DEPOT_HEADER_2_0 = {
+	.id = VDO_SLAB_DEPOT,
+	.version = {
+		.major_version = 2,
+		.minor_version = 0,
+	},
+	.size = sizeof(struct slab_depot_state_2_0),
+};
+
+const struct header VDO_LAYOUT_HEADER_3_0 = {
+	.id = VDO_LAYOUT,
+	.version = {
+		.major_version = 3,
+		.minor_version = 0,
+	},
+	.size = sizeof(struct layout_3_0) + (sizeof(struct partition_3_0) * VDO_PARTITION_COUNT),
+};
+
+static const enum partition_id REQUIRED_PARTITIONS[] = {
+	VDO_BLOCK_MAP_PARTITION,
+	VDO_SLAB_DEPOT_PARTITION,
+	VDO_RECOVERY_JOURNAL_PARTITION,
+	VDO_SLAB_SUMMARY_PARTITION,
+};
+
+/*
+ * The current version for the data encoded in the super block. This must be changed any time there
+ * is a change to encoding of the component data of any VDO component.
+ */
+static const struct version_number VDO_COMPONENT_DATA_41_0 = {
+	.major_version = 41,
+	.minor_version = 0,
+};
+
+const struct version_number VDO_VOLUME_VERSION_67_0 = {
+	.major_version = 67,
+	.minor_version = 0,
+};
+
+static const struct header SUPER_BLOCK_HEADER_12_0 = {
+	.id = VDO_SUPER_BLOCK,
+	.version = {
+			.major_version = 12,
+			.minor_version = 0,
+		},
+
+	/* This is the minimum size, if the super block contains no components. */
+	.size = VDO_SUPER_BLOCK_FIXED_SIZE - VDO_ENCODED_HEADER_SIZE,
+};
+
+/**
+ * validate_version() - Check whether a version matches an expected version.
+ * @expected_version: The expected version.
+ * @actual_version: The version being validated.
+ * @component_name: The name of the component or the calling function (for error logging).
+ *
+ * Logs an error describing a mismatch.
+ *
+ * Return: VDO_SUCCESS             if the versions are the same,
+ *         VDO_UNSUPPORTED_VERSION if the versions don't match.
+ */
+static int __must_check validate_version(struct version_number expected_version,
+					 struct version_number actual_version,
+					 const char *component_name)
+{
+	if (!vdo_are_same_version(expected_version, actual_version))
+		return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+					      "%s version mismatch, expected %d.%d, got %d.%d",
+					      component_name,
+					      expected_version.major_version,
+					      expected_version.minor_version,
+					      actual_version.major_version,
+					      actual_version.minor_version);
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_validate_header() - Check whether a header matches expectations.
+ * @expected_header: The expected header.
+ * @actual_header: The header being validated.
+ * @exact_size: If true, the size fields of the two headers must be the same, otherwise it is
+ *              required that actual_header.size >= expected_header.size.
+ * @name: The name of the component or the calling function (for error logging).
+ *
+ * Logs an error describing the first mismatch found.
+ *
+ * Return: VDO_SUCCESS             if the header meets expectations,
+ *         VDO_INCORRECT_COMPONENT if the component ids don't match,
+ *         VDO_UNSUPPORTED_VERSION if the versions or sizes don't match.
+ */
+int vdo_validate_header(const struct header *expected_header,
+			const struct header *actual_header,
+			bool exact_size,
+			const char *name)
+{
+	int result;
+
+	if (expected_header->id != actual_header->id)
+		return uds_log_error_strerror(VDO_INCORRECT_COMPONENT,
+					      "%s ID mismatch, expected %d, got %d",
+					      name,
+					      expected_header->id,
+					      actual_header->id);
+
+	result = validate_version(expected_header->version, actual_header->version, name);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if ((expected_header->size > actual_header->size) ||
+	    (exact_size && (expected_header->size < actual_header->size)))
+		return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+					      "%s size mismatch, expected %zu, got %zu",
+					      name,
+					      expected_header->size,
+					      actual_header->size);
+
+	return VDO_SUCCESS;
+}
+
+static void encode_version_number(u8 *buffer, size_t *offset, struct version_number version)
+{
+	struct packed_version_number packed = vdo_pack_version_number(version);
+
+	memcpy(buffer + *offset, &packed, sizeof(packed));
+	*offset += sizeof(packed);
+}
+
+void vdo_encode_header(u8 *buffer, size_t *offset, const struct header *header)
+{
+	struct packed_header packed = vdo_pack_header(header);
+
+	memcpy(buffer + *offset, &packed, sizeof(packed));
+	*offset += sizeof(packed);
+}
+
+static void decode_version_number(u8 *buffer, size_t *offset, struct version_number *version)
+{
+	struct packed_version_number packed;
+
+	memcpy(&packed, buffer + *offset, sizeof(packed));
+	*offset += sizeof(packed);
+	*version = vdo_unpack_version_number(packed);
+}
+
+void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header)
+{
+	struct packed_header packed;
+
+	memcpy(&packed, buffer + *offset, sizeof(packed));
+	*offset += sizeof(packed);
+
+	*header = vdo_unpack_header(&packed);
+}
+
+/**
+ * is_loadable_release_version() - Determine whether the supplied release version can be understood
+ *                                 by the VDO code.
+ * @version: The release version number to check.
+ *
+ * Return: True if the given version can be loaded.
+ */
+static inline bool is_loadable_release_version(release_version_number_t version)
+{
+	unsigned int i;
+
+	if (version == VDO_CURRENT_RELEASE_VERSION_NUMBER)
+		return true;
+
+	for (i = 0; i < ARRAY_SIZE(COMPATIBLE_RELEASE_VERSIONS); i++)
+		if (version == COMPATIBLE_RELEASE_VERSIONS[i])
+			return true;
+
+	return false;
+}
+
+/**
+ * decode_volume_geometry() - Decode the on-disk representation of a volume geometry from a buffer.
+ * @buffer: A buffer to decode from.
+ * @offset: The offset in the buffer at which to decode.
+ * @geometry: The structure to receive the decoded fields.
+ * @version: The geometry block version to decode.
+ */
+static void
+decode_volume_geometry(u8 *buffer, size_t *offset, struct volume_geometry *geometry, u32 version)
+{
+	release_version_number_t release_version;
+	enum volume_region_id id;
+	nonce_t nonce;
+	block_count_t bio_offset = 0;
+	u32 mem;
+	bool sparse;
+
+	decode_u32_le(buffer, offset, &release_version);
+	decode_u64_le(buffer, offset, &nonce);
+	geometry->release_version = release_version;
+	geometry->nonce = nonce;
+
+	memcpy((unsigned char *) &geometry->uuid, buffer + *offset, sizeof(uuid_t));
+	*offset += sizeof(uuid_t);
+
+	if (version > 4)
+		decode_u64_le(buffer, offset, &bio_offset);
+	geometry->bio_offset = bio_offset;
+
+	for (id = 0; id < VDO_VOLUME_REGION_COUNT; id++) {
+		physical_block_number_t start_block;
+		enum volume_region_id saved_id;
+
+		decode_u32_le(buffer, offset, &saved_id);
+		decode_u64_le(buffer, offset, &start_block);
+
+		geometry->regions[id] = (struct volume_region) {
+			.id = saved_id,
+			.start_block = start_block,
+		};
+	}
+
+	decode_u32_le(buffer, offset, &mem);
+	/* Skip the unused u32 field of the on-disk index config. */
+	*offset += sizeof(u32);
+	sparse = buffer[(*offset)++];
+
+	geometry->index_config = (struct index_config) {
+		.mem = mem,
+		.sparse = sparse,
+	};
+}
+
+/**
+ * vdo_parse_geometry_block() - Decode and validate an encoded geometry block.
+ * @block: The encoded geometry block.
+ * @geometry: The structure to receive the decoded fields.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int __must_check vdo_parse_geometry_block(u8 *block, struct volume_geometry *geometry)
+{
+	u32 checksum, saved_checksum;
+	struct header header;
+	size_t offset = 0;
+	int result;
+
+	if (memcmp(block, VDO_GEOMETRY_MAGIC_NUMBER, VDO_GEOMETRY_MAGIC_NUMBER_SIZE) != 0)
+		return VDO_BAD_MAGIC;
+	offset += VDO_GEOMETRY_MAGIC_NUMBER_SIZE;
+
+	vdo_decode_header(block, &offset, &header);
+	if (header.version.major_version <= 4)
+		result = vdo_validate_header(&GEOMETRY_BLOCK_HEADER_4_0, &header, true, __func__);
+	else
+		result = vdo_validate_header(&GEOMETRY_BLOCK_HEADER_5_0, &header, true, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	decode_volume_geometry(block, &offset, geometry, header.version.major_version);
+
+	result = ASSERT(header.size == offset + sizeof(u32),
+			"should have decoded up to the geometry checksum");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Decode and verify the checksum. */
+	checksum = vdo_crc32(block, offset);
+	decode_u32_le(block, &offset, &saved_checksum);
+
+	if (!is_loadable_release_version(geometry->release_version))
+		return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+					      "release version %d cannot be loaded",
+					      geometry->release_version);
+
+	return ((checksum == saved_checksum) ? VDO_SUCCESS : VDO_CHECKSUM_MISMATCH);
+}
+
+struct block_map_page *vdo_format_block_map_page(void *buffer,
+						 nonce_t nonce,
+						 physical_block_number_t pbn,
+						 bool initialized)
+{
+	struct block_map_page *page = (struct block_map_page *) buffer;
+
+	memset(buffer, 0, VDO_BLOCK_SIZE);
+	page->version = vdo_pack_version_number(BLOCK_MAP_4_1);
+	page->header.nonce = __cpu_to_le64(nonce);
+	page->header.pbn = __cpu_to_le64(pbn);
+	page->header.initialized = initialized;
+	return page;
+}
+
+enum block_map_page_validity
+vdo_validate_block_map_page(struct block_map_page *page,
+			    nonce_t nonce,
+			    physical_block_number_t pbn)
+{
+	STATIC_ASSERT_SIZEOF(struct block_map_page_header, PAGE_HEADER_4_1_SIZE);
+
+	if (!vdo_are_same_version(BLOCK_MAP_4_1, vdo_unpack_version_number(page->version)) ||
+	    !page->header.initialized ||
+	    (nonce != __le64_to_cpu(page->header.nonce)))
+		return VDO_BLOCK_MAP_PAGE_INVALID;
+
+	if (pbn != vdo_get_block_map_page_pbn(page))
+		return VDO_BLOCK_MAP_PAGE_BAD;
+
+	return VDO_BLOCK_MAP_PAGE_VALID;
+}
+
+static int
+decode_block_map_state_2_0(u8 *buffer, size_t *offset, struct block_map_state_2_0 *state)
+{
+	size_t initial_offset;
+	block_count_t flat_page_count, root_count;
+	physical_block_number_t flat_page_origin, root_origin;
+	struct header header;
+	int result;
+
+	vdo_decode_header(buffer, offset, &header);
+	result = vdo_validate_header(&VDO_BLOCK_MAP_HEADER_2_0, &header, true, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initial_offset = *offset;
+
+	decode_u64_le(buffer, offset, &flat_page_origin);
+	result = ASSERT(flat_page_origin == VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
+			"Flat page origin must be %u (recorded as %llu)",
+			VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN,
+			(unsigned long long) flat_page_origin);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	decode_u64_le(buffer, offset, &flat_page_count);
+	result = ASSERT(flat_page_count == 0,
+			"Flat page count must be 0 (recorded as %llu)",
+			(unsigned long long) flat_page_count);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	decode_u64_le(buffer, offset, &root_origin);
+	decode_u64_le(buffer, offset, &root_count);
+
+	result = ASSERT(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset,
+			"decoded block map component size must match header size");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*state = (struct block_map_state_2_0) {
+		.flat_page_origin = flat_page_origin,
+		.flat_page_count = flat_page_count,
+		.root_origin = root_origin,
+		.root_count = root_count,
+	};
+
+	return VDO_SUCCESS;
+}
+
+static void
+encode_block_map_state_2_0(u8 *buffer, size_t *offset, struct block_map_state_2_0 state)
+{
+	size_t initial_offset;
+
+	vdo_encode_header(buffer, offset, &VDO_BLOCK_MAP_HEADER_2_0);
+
+	initial_offset = *offset;
+	encode_u64_le(buffer, offset, state.flat_page_origin);
+	encode_u64_le(buffer, offset, state.flat_page_count);
+	encode_u64_le(buffer, offset, state.root_origin);
+	encode_u64_le(buffer, offset, state.root_count);
+
+	ASSERT_LOG_ONLY(VDO_BLOCK_MAP_HEADER_2_0.size == *offset - initial_offset,
+			"encoded block map component size must match header size");
+}
+
+/**
+ * vdo_compute_new_forest_pages() - Compute the number of pages which must be allocated at each
+ *                                  level in order to grow the forest to a new number of entries.
+ * @root_count: The number of roots the block map is using.
+ * @old_sizes: The current size of the forest at each level, or NULL if there is no existing
+ *             forest.
+ * @entries: The new number of entries the block map must address.
+ * @new_sizes: A structure to receive the new size of the forest at each level.
+ *
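+ * For example (assuming 4 KB block map pages with a 36-byte header, so that
+ * VDO_BLOCK_MAP_ENTRIES_PER_PAGE is (4096 - 36) / 5 = 812): with root_count = 1,
+ * old_sizes = NULL, and 1000 leaf pages, the first interior level needs
+ * DIV_ROUND_UP(1000, 812) = 2 new pages and each level above it needs 1.
+ *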
+ * Return: The total number of non-leaf pages required.
+ */
+block_count_t vdo_compute_new_forest_pages(root_count_t root_count,
+					   struct boundary *old_sizes,
+					   block_count_t entries,
+					   struct boundary *new_sizes)
+{
+	page_count_t leaf_pages = max(vdo_compute_block_map_page_count(entries), 1U);
+	page_count_t level_size = DIV_ROUND_UP(leaf_pages, root_count);
+	block_count_t total_pages = 0;
+	height_t height;
+
+	for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) {
+		block_count_t new_pages;
+
+		level_size = DIV_ROUND_UP(level_size, VDO_BLOCK_MAP_ENTRIES_PER_PAGE);
+		new_sizes->levels[height] = level_size;
+		new_pages = level_size;
+		if (old_sizes != NULL)
+			new_pages -= old_sizes->levels[height];
+		total_pages += (new_pages * root_count);
+	}
+
+	return total_pages;
+}
+
+/**
+ * encode_recovery_journal_state_7_0() - Encode the state of a recovery journal.
+ */
+static void
+encode_recovery_journal_state_7_0(u8 *buffer,
+				  size_t *offset,
+				  struct recovery_journal_state_7_0 state)
+{
+	size_t initial_offset;
+
+	vdo_encode_header(buffer, offset, &VDO_RECOVERY_JOURNAL_HEADER_7_0);
+
+	initial_offset = *offset;
+	encode_u64_le(buffer, offset, state.journal_start);
+	encode_u64_le(buffer, offset, state.logical_blocks_used);
+	encode_u64_le(buffer, offset, state.block_map_data_blocks);
+
+	ASSERT_LOG_ONLY(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset,
+			"encoded recovery journal component size must match header size");
+}
+
+/**
+ * decode_recovery_journal_state_7_0() - Decode the state of a recovery journal saved in a buffer.
+ * @buffer: The buffer containing the saved state.
+ * @offset: The offset in the buffer at which to decode.
+ * @state: A pointer to a recovery journal state to hold the result of a successful decode.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check
+decode_recovery_journal_state_7_0(u8 *buffer,
+				  size_t *offset,
+				  struct recovery_journal_state_7_0 *state)
+{
+	struct header header;
+	int result;
+	size_t initial_offset;
+	sequence_number_t journal_start;
+	block_count_t logical_blocks_used, block_map_data_blocks;
+
+	vdo_decode_header(buffer, offset, &header);
+	result = vdo_validate_header(&VDO_RECOVERY_JOURNAL_HEADER_7_0, &header, true, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initial_offset = *offset;
+	decode_u64_le(buffer, offset, &journal_start);
+	decode_u64_le(buffer, offset, &logical_blocks_used);
+	decode_u64_le(buffer, offset, &block_map_data_blocks);
+
+	result = ASSERT(VDO_RECOVERY_JOURNAL_HEADER_7_0.size == *offset - initial_offset,
+			"decoded recovery journal component size must match header size");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*state = (struct recovery_journal_state_7_0) {
+		.journal_start = journal_start,
+		.logical_blocks_used = logical_blocks_used,
+		.block_map_data_blocks = block_map_data_blocks,
+	};
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_get_journal_operation_name() - Get the name of a journal operation.
+ * @operation: The operation to name.
+ *
+ * Return: The name of the operation.
+ */
+const char *vdo_get_journal_operation_name(enum journal_operation operation)
+{
+	switch (operation) {
+	case VDO_JOURNAL_DATA_REMAPPING:
+		return "data remapping";
+
+	case VDO_JOURNAL_BLOCK_MAP_REMAPPING:
+		return "block map remapping";
+
+	default:
+		return "unknown journal operation";
+	}
+}
+
+/**
+ * encode_slab_depot_state_2_0() - Encode the state of a slab depot into a buffer.
+ */
+static void
+encode_slab_depot_state_2_0(u8 *buffer, size_t *offset, struct slab_depot_state_2_0 state)
+{
+	size_t initial_offset;
+
+	vdo_encode_header(buffer, offset, &VDO_SLAB_DEPOT_HEADER_2_0);
+
+	initial_offset = *offset;
+	encode_u64_le(buffer, offset, state.slab_config.slab_blocks);
+	encode_u64_le(buffer, offset, state.slab_config.data_blocks);
+	encode_u64_le(buffer, offset, state.slab_config.reference_count_blocks);
+	encode_u64_le(buffer, offset, state.slab_config.slab_journal_blocks);
+	encode_u64_le(buffer, offset, state.slab_config.slab_journal_flushing_threshold);
+	encode_u64_le(buffer, offset, state.slab_config.slab_journal_blocking_threshold);
+	encode_u64_le(buffer, offset, state.slab_config.slab_journal_scrubbing_threshold);
+	encode_u64_le(buffer, offset, state.first_block);
+	encode_u64_le(buffer, offset, state.last_block);
+	buffer[(*offset)++] = state.zone_count;
+
+	ASSERT_LOG_ONLY(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset,
+			"encoded slab depot component size must match header size");
+}
+
+/**
+ * decode_slab_depot_state_2_0() - Decode slab depot component state version 2.0 from a buffer.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int
+decode_slab_depot_state_2_0(u8 *buffer, size_t *offset, struct slab_depot_state_2_0 *state)
+{
+	struct header header;
+	int result;
+	size_t initial_offset;
+	struct slab_config slab_config;
+	block_count_t count;
+	physical_block_number_t first_block, last_block;
+	zone_count_t zone_count;
+
+	vdo_decode_header(buffer, offset, &header);
+	result = vdo_validate_header(&VDO_SLAB_DEPOT_HEADER_2_0, &header, true, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initial_offset = *offset;
+	decode_u64_le(buffer, offset, &count);
+	slab_config.slab_blocks = count;
+
+	decode_u64_le(buffer, offset, &count);
+	slab_config.data_blocks = count;
+
+	decode_u64_le(buffer, offset, &count);
+	slab_config.reference_count_blocks = count;
+
+	decode_u64_le(buffer, offset, &count);
+	slab_config.slab_journal_blocks = count;
+
+	decode_u64_le(buffer, offset, &count);
+	slab_config.slab_journal_flushing_threshold = count;
+
+	decode_u64_le(buffer, offset, &count);
+	slab_config.slab_journal_blocking_threshold = count;
+
+	decode_u64_le(buffer, offset, &count);
+	slab_config.slab_journal_scrubbing_threshold = count;
+
+	decode_u64_le(buffer, offset, &first_block);
+	decode_u64_le(buffer, offset, &last_block);
+	zone_count = buffer[(*offset)++];
+
+	result = ASSERT(VDO_SLAB_DEPOT_HEADER_2_0.size == *offset - initial_offset,
+			"decoded slab depot component size must match header size");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	*state = (struct slab_depot_state_2_0) {
+		.slab_config = slab_config,
+		.first_block = first_block,
+		.last_block = last_block,
+		.zone_count = zone_count,
+	};
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_configure_slab_depot() - Configure the slab depot.
+ * @partition: The slab depot partition.
+ * @slab_config: The configuration of a single slab.
+ * @zone_count: The number of zones the depot will use.
+ * @state: The state structure to be configured.
+ *
+ * Configures the slab_depot for the specified storage capacity, finding the number of data
+ * blocks that will fit while still leaving room for the depot metadata, and then returns the
+ * saved state for that configuration.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_configure_slab_depot(const struct partition *partition,
+			     struct slab_config slab_config,
+			     zone_count_t zone_count,
+			     struct slab_depot_state_2_0 *state)
+{
+	block_count_t total_slab_blocks, total_data_blocks;
+	size_t slab_count;
+	physical_block_number_t last_block;
+	block_count_t slab_size = slab_config.slab_blocks;
+
+	uds_log_debug("slab_depot %s(block_count=%llu, first_block=%llu, slab_size=%llu, zone_count=%u)",
+		      __func__,
+		      (unsigned long long) partition->count,
+		      (unsigned long long) partition->offset,
+		      (unsigned long long) slab_size,
+		      zone_count);
+
+	/* We do not allow runt slabs, so we waste up to a slab's worth. */
+	slab_count = (partition->count / slab_size);
+	if (slab_count == 0)
+		return VDO_NO_SPACE;
+
+	if (slab_count > MAX_VDO_SLABS)
+		return VDO_TOO_MANY_SLABS;
+
+	total_slab_blocks = slab_count * slab_config.slab_blocks;
+	total_data_blocks = slab_count * slab_config.data_blocks;
+	last_block = partition->offset + total_slab_blocks;
+
+	*state = (struct slab_depot_state_2_0) {
+		.slab_config = slab_config,
+		.first_block = partition->offset,
+		.last_block = last_block,
+		.zone_count = zone_count,
+	};
+
+	uds_log_debug("slab_depot last_block=%llu, total_data_blocks=%llu, slab_count=%zu, left_over=%llu",
+		      (unsigned long long) last_block,
+		      (unsigned long long) total_data_blocks,
+		      slab_count,
+		      (unsigned long long) (partition->count - (last_block - partition->offset)));
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_configure_slab() - Measure and initialize the configuration to use for each slab.
+ * @slab_size: The number of blocks per slab.
+ * @slab_journal_blocks: The number of blocks for the slab journal.
+ * @slab_config: The slab configuration to initialize.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_configure_slab(block_count_t slab_size,
+		       block_count_t slab_journal_blocks,
+		       struct slab_config *slab_config)
+{
+	block_count_t ref_blocks, meta_blocks, data_blocks;
+	block_count_t flushing_threshold, remaining, blocking_threshold;
+	block_count_t minimal_extra_space, scrubbing_threshold;
+
+	if (slab_journal_blocks >= slab_size)
+		return VDO_BAD_CONFIGURATION;
+
+	/*
+	 * This calculation should technically be a recurrence, but the total number of metadata
+	 * blocks is currently less than a single block of ref_counts, so we'd gain at most one
+	 * data block in each slab with more iteration.
+	 */
+	ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks);
+	meta_blocks = (ref_blocks + slab_journal_blocks);
+
+	/* Make sure test code hasn't configured slabs to be too small. */
+	if (meta_blocks >= slab_size)
+		return VDO_BAD_CONFIGURATION;
+
+	/*
+	 * If the slab size is very small, assume this must be a unit test and override the number
+	 * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their
+	 * data_blocks fields to be the exact capacity of the configured volume, and that used to
+	 * fall out since they use a power of two for the number of data blocks, the slab size was
+	 * a power of two, and every block in a slab was a data block.
+	 *
+	 * TODO: Try to figure out some way of structuring testParameters and unit tests so this
+	 * hack isn't needed without having to edit several unit tests every time the metadata size
+	 * changes by one block.
+	 */
+	data_blocks = slab_size - meta_blocks;
+	if ((slab_size < 1024) && !is_power_of_2(data_blocks))
+		data_blocks = ((block_count_t) 1 << ilog2(data_blocks));
+
+	/*
+	 * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in
+	 * production, or 3/4ths, so we use this ratio for all sizes.
+	 */
+	flushing_threshold = ((slab_journal_blocks * 3) + 3) / 4;
+	/*
+	 * The blocking threshold should be far enough from the flushing threshold to not produce
+	 * delays, but far enough from the end of the journal to allow multiple successive recovery
+	 * failures.
+	 */
+	remaining = slab_journal_blocks - flushing_threshold;
+	blocking_threshold = flushing_threshold + ((remaining * 5) / 7);
+	/* The scrubbing threshold should be at least 2048 entries before the end of the journal. */
+	minimal_extra_space = 1 + (MAXIMUM_VDO_USER_VIOS / VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK);
+	scrubbing_threshold = blocking_threshold;
+	if (slab_journal_blocks > minimal_extra_space)
+		scrubbing_threshold = slab_journal_blocks - minimal_extra_space;
+	if (blocking_threshold > scrubbing_threshold)
+		blocking_threshold = scrubbing_threshold;
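+	/*
+	 * For example, with the production value of 224 slab journal blocks noted above:
+	 * flushing_threshold = ((224 * 3) + 3) / 4 = 168, remaining = 56, and
+	 * blocking_threshold starts at 168 + ((56 * 5) / 7) = 208 before being capped at the
+	 * scrubbing threshold if necessary.
+	 */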
+
+	*slab_config = (struct slab_config) {
+		.slab_blocks = slab_size,
+		.data_blocks = data_blocks,
+		.reference_count_blocks = ref_blocks,
+		.slab_journal_blocks = slab_journal_blocks,
+		.slab_journal_flushing_threshold = flushing_threshold,
+		.slab_journal_blocking_threshold = blocking_threshold,
+		.slab_journal_scrubbing_threshold = scrubbing_threshold,
+	};
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_decode_slab_journal_entry() - Decode a slab journal entry.
+ * @block: The journal block holding the entry.
+ * @entry_count: The number of the entry.
+ *
+ * Return: The decoded entry.
+ */
+struct slab_journal_entry
+vdo_decode_slab_journal_entry(struct packed_slab_journal_block *block,
+			      journal_entry_count_t entry_count)
+{
+	struct slab_journal_entry entry =
+		vdo_unpack_slab_journal_entry(&block->payload.entries[entry_count]);
+
+	if (block->header.has_block_map_increments &&
+	    ((block->payload.full_entries.entry_types[entry_count / 8] &
+	      ((u8)1 << (entry_count % 8))) != 0))
+		entry.operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING;
+
+	return entry;
+}
+
+/**
+ * allocate_partition() - Allocate a partition and add it to a layout.
+ * @layout: The layout containing the partition.
+ * @id: The id of the partition.
+ * @offset: The offset into the layout at which the partition begins.
+ * @size: The size of the partition in blocks.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int allocate_partition(struct layout *layout,
+			      u8 id,
+			      physical_block_number_t offset,
+			      block_count_t size)
+{
+	struct partition *partition;
+	int result;
+
+	result = UDS_ALLOCATE(1, struct partition, __func__, &partition);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	partition->id = id;
+	partition->offset = offset;
+	partition->count = size;
+	partition->next = layout->head;
+	layout->head = partition;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * make_partition() - Create a new partition from the beginning or end of the unused space in a
+ *                    layout.
+ * @layout: The layout.
+ * @id: The id of the partition to make.
+ * @size: The number of blocks to carve out; if 0, all remaining space will be used.
+ * @beginning: True if the partition should start at the beginning of the unused space.
+ *
+ * Return: A success or error code, particularly VDO_NO_SPACE if there are fewer than size blocks
+ *         remaining.
+ */
+static int __must_check
+make_partition(struct layout *layout,
+	       enum partition_id id,
+	       block_count_t size,
+	       bool beginning)
+{
+	int result;
+	physical_block_number_t offset;
+	block_count_t free_blocks = layout->last_free - layout->first_free;
+
+	if (size == 0) {
+		if (free_blocks == 0)
+			return VDO_NO_SPACE;
+		size = free_blocks;
+	} else if (size > free_blocks) {
+		return VDO_NO_SPACE;
+	}
+
+	result = vdo_get_partition(layout, id, NULL);
+	if (result != VDO_UNKNOWN_PARTITION)
+		return VDO_PARTITION_EXISTS;
+
+	offset = beginning ? layout->first_free : (layout->last_free - size);
+
+	result = allocate_partition(layout, id, offset, size);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	layout->num_partitions++;
+	if (beginning)
+		layout->first_free += size;
+	else
+		layout->last_free -= size;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_initialize_layout() - Lay out the partitions of a vdo.
+ * @size: The entire size of the vdo.
+ * @offset: The start of the layout on the underlying storage in blocks.
+ * @block_map_blocks: The size of the block map partition.
+ * @journal_blocks: The size of the journal partition.
+ * @summary_blocks: The size of the slab summary partition.
+ * @layout: The layout to initialize.
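+ *
+ * The resulting layout places the block map partition first, the slab depot in the middle, and
+ * the recovery journal and slab summary partitions at the end.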
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_initialize_layout(block_count_t size,
+			  physical_block_number_t offset,
+			  block_count_t block_map_blocks,
+			  block_count_t journal_blocks,
+			  block_count_t summary_blocks,
+			  struct layout *layout)
+{
+	int result;
+	block_count_t necessary_size =
+		(offset + block_map_blocks + journal_blocks + summary_blocks);
+
+	if (necessary_size > size)
+		return uds_log_error_strerror(VDO_NO_SPACE, "Not enough space to make a VDO");
+
+	*layout = (struct layout) {
+		.start = offset,
+		.size = size,
+		.first_free = offset,
+		.last_free = size,
+		.num_partitions = 0,
+		.head = NULL,
+	};
+
+	result = make_partition(layout, VDO_BLOCK_MAP_PARTITION, block_map_blocks, true);
+	if (result != VDO_SUCCESS) {
+		vdo_uninitialize_layout(layout);
+		return result;
+	}
+
+	result = make_partition(layout, VDO_SLAB_SUMMARY_PARTITION, summary_blocks, false);
+	if (result != VDO_SUCCESS) {
+		vdo_uninitialize_layout(layout);
+		return result;
+	}
+
+	result = make_partition(layout, VDO_RECOVERY_JOURNAL_PARTITION, journal_blocks, false);
+	if (result != VDO_SUCCESS) {
+		vdo_uninitialize_layout(layout);
+		return result;
+	}
+
+	result = make_partition(layout, VDO_SLAB_DEPOT_PARTITION, 0, true);
+	if (result != VDO_SUCCESS)
+		vdo_uninitialize_layout(layout);
+
+	return result;
+}
+
+/**
+ * vdo_uninitialize_layout() - Clean up a layout.
+ * @layout: The layout to clean up.
+ *
+ * All pointers to partitions in this layout become invalid.
+ */
+void vdo_uninitialize_layout(struct layout *layout)
+{
+	while (layout->head != NULL) {
+		struct partition *part = layout->head;
+
+		layout->head = part->next;
+		UDS_FREE(part);
+	}
+
+	memset(layout, 0, sizeof(struct layout));
+}
+
+/**
+ * vdo_get_partition() - Get a partition by id.
+ * @layout: The layout from which to get a partition.
+ * @id: The id of the partition.
+ * @partition_ptr: A pointer to hold the partition.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_get_partition(struct layout *layout,
+		      enum partition_id id,
+		      struct partition **partition_ptr)
+{
+	struct partition *partition;
+
+	for (partition = layout->head; partition != NULL; partition = partition->next) {
+		if (partition->id == id) {
+			if (partition_ptr != NULL)
+				*partition_ptr = partition;
+			return VDO_SUCCESS;
+		}
+	}
+
+	return VDO_UNKNOWN_PARTITION;
+}
+
+/**
+ * vdo_get_known_partition() - Get a partition by id from a validated layout.
+ * @layout: The layout from which to get a partition.
+ * @id: The id of the partition.
+ *
+ * Return: The partition.
+ */
+struct partition *vdo_get_known_partition(struct layout *layout, enum partition_id id)
+{
+	struct partition *partition;
+	int result = vdo_get_partition(layout, id, &partition);
+
+	ASSERT_LOG_ONLY(result == VDO_SUCCESS, "layout has expected partition: %u", id);
+
+	return partition;
+}
+
+static void encode_layout(u8 *buffer, size_t *offset, const struct layout *layout)
+{
+	const struct partition *partition;
+	size_t initial_offset;
+	struct header header = VDO_LAYOUT_HEADER_3_0;
+
+	STATIC_ASSERT_SIZEOF(enum partition_id, sizeof(u8));
+	ASSERT_LOG_ONLY(layout->num_partitions <= U8_MAX,
+			"layout partition count must fit in a byte");
+
+	vdo_encode_header(buffer, offset, &header);
+
+	initial_offset = *offset;
+	encode_u64_le(buffer, offset, layout->first_free);
+	encode_u64_le(buffer, offset, layout->last_free);
+	buffer[(*offset)++] = layout->num_partitions;
+
+	ASSERT_LOG_ONLY(sizeof(struct layout_3_0) == *offset - initial_offset,
+			"encoded size of a layout header must match structure");
+
+	for (partition = layout->head; partition != NULL; partition = partition->next) {
+		buffer[(*offset)++] = partition->id;
+		encode_u64_le(buffer, offset, partition->offset);
+		/* This field only exists for backwards compatibility. */
+		encode_u64_le(buffer, offset, 0);
+		encode_u64_le(buffer, offset, partition->count);
+	}
+
+	ASSERT_LOG_ONLY(header.size == *offset - initial_offset,
+			"encoded size of a layout must match header size");
+}
+
+static int
+decode_layout(u8 *buffer,
+	      size_t *offset,
+	      physical_block_number_t start,
+	      block_count_t size,
+	      struct layout *layout)
+{
+	struct header header;
+	struct layout_3_0 layout_header;
+	struct partition *partition;
+	size_t initial_offset;
+	physical_block_number_t first_free, last_free;
+	u8 partition_count;
+	u8 i;
+	int result;
+
+	vdo_decode_header(buffer, offset, &header);
+	/* Layout is variable size, so only do a minimum size check here. */
+	result = vdo_validate_header(&VDO_LAYOUT_HEADER_3_0, &header, false, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initial_offset = *offset;
+	decode_u64_le(buffer, offset, &first_free);
+	decode_u64_le(buffer, offset, &last_free);
+	partition_count = buffer[(*offset)++];
+	layout_header = (struct layout_3_0) {
+		.first_free = first_free,
+		.last_free = last_free,
+		.partition_count = partition_count,
+	};
+
+	result = ASSERT(sizeof(struct layout_3_0) == *offset - initial_offset,
+			"decoded size of a layout header must match structure");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	layout->start = start;
+	layout->size = size;
+	layout->first_free = layout_header.first_free;
+	layout->last_free = layout_header.last_free;
+	layout->num_partitions = layout_header.partition_count;
+
+	if (layout->num_partitions > VDO_PARTITION_COUNT)
+		return uds_log_error_strerror(VDO_UNKNOWN_PARTITION,
+					      "layout has extra partitions");
+
+	for (i = 0; i < layout->num_partitions; i++) {
+		u8 id;
+		u64 partition_offset, count;
+
+		id = buffer[(*offset)++];
+		decode_u64_le(buffer, offset, &partition_offset);
+		/* Skip the zero field which only exists for backwards compatibility. */
+		*offset += sizeof(u64);
+		decode_u64_le(buffer, offset, &count);
+
+		result = allocate_partition(layout, id, partition_offset, count);
+		if (result != VDO_SUCCESS) {
+			vdo_uninitialize_layout(layout);
+			return result;
+		}
+	}
+
+	/* Validate that the layout has all (and only) the required partitions */
+	for (i = 0; i < VDO_PARTITION_COUNT; i++) {
+		result = vdo_get_partition(layout, REQUIRED_PARTITIONS[i], &partition);
+		if (result != VDO_SUCCESS) {
+			vdo_uninitialize_layout(layout);
+			return uds_log_error_strerror(result,
+						      "layout is missing required partition %u",
+						      REQUIRED_PARTITIONS[i]);
+		}
+
+		start += partition->count;
+	}
+
+	if (start != size) {
+		vdo_uninitialize_layout(layout);
+		return uds_log_error_strerror(UDS_BAD_STATE, "partitions do not cover the layout");
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * pack_vdo_config() - Convert a vdo_config to its packed on-disk representation.
+ * @config: The vdo config to convert.
+ *
+ * Return: The platform-independent representation of the config.
+ */
+static struct packed_vdo_config pack_vdo_config(struct vdo_config config)
+{
+	return (struct packed_vdo_config) {
+		.logical_blocks = __cpu_to_le64(config.logical_blocks),
+		.physical_blocks = __cpu_to_le64(config.physical_blocks),
+		.slab_size = __cpu_to_le64(config.slab_size),
+		.recovery_journal_size = __cpu_to_le64(config.recovery_journal_size),
+		.slab_journal_blocks = __cpu_to_le64(config.slab_journal_blocks),
+	};
+}
+
+/**
+ * pack_vdo_component() - Convert a vdo_component to its packed on-disk representation.
+ * @component: The VDO component data to convert.
+ *
+ * Return: The platform-independent representation of the component.
+ */
+static struct packed_vdo_component_41_0 pack_vdo_component(const struct vdo_component component)
+{
+	return (struct packed_vdo_component_41_0) {
+		.state = __cpu_to_le32(component.state),
+		.complete_recoveries = __cpu_to_le64(component.complete_recoveries),
+		.read_only_recoveries = __cpu_to_le64(component.read_only_recoveries),
+		.config = pack_vdo_config(component.config),
+		.nonce = __cpu_to_le64(component.nonce),
+	};
+}
+
+static void encode_vdo_component(u8 *buffer, size_t *offset, struct vdo_component component)
+{
+	struct packed_vdo_component_41_0 packed;
+
+	encode_version_number(buffer, offset, VDO_COMPONENT_DATA_41_0);
+	packed = pack_vdo_component(component);
+	memcpy(buffer + *offset, &packed, sizeof(packed));
+	*offset += sizeof(packed);
+}
+
+/**
+ * unpack_vdo_config() - Convert a packed_vdo_config to its native in-memory representation.
+ * @config: The packed vdo config to convert.
+ *
+ * Return: The native in-memory representation of the vdo config.
+ */
+static struct vdo_config unpack_vdo_config(struct packed_vdo_config config)
+{
+	return (struct vdo_config) {
+		.logical_blocks = __le64_to_cpu(config.logical_blocks),
+		.physical_blocks = __le64_to_cpu(config.physical_blocks),
+		.slab_size = __le64_to_cpu(config.slab_size),
+		.recovery_journal_size = __le64_to_cpu(config.recovery_journal_size),
+		.slab_journal_blocks = __le64_to_cpu(config.slab_journal_blocks),
+	};
+}
+
+/**
+ * unpack_vdo_component_41_0() - Convert a packed_vdo_component_41_0 to its native in-memory
+ *				 representation.
+ * @component: The packed vdo component data to convert.
+ *
+ * Return: The native in-memory representation of the component.
+ */
+static struct vdo_component
+unpack_vdo_component_41_0(struct packed_vdo_component_41_0 component)
+{
+	return (struct vdo_component) {
+		.state = __le32_to_cpu(component.state),
+		.complete_recoveries = __le64_to_cpu(component.complete_recoveries),
+		.read_only_recoveries = __le64_to_cpu(component.read_only_recoveries),
+		.config = unpack_vdo_config(component.config),
+		.nonce = __le64_to_cpu(component.nonce),
+	};
+}
+
+/**
+ * decode_vdo_component() - Decode the component data for the vdo itself out of the super block.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int decode_vdo_component(u8 *buffer, size_t *offset, struct vdo_component *component)
+{
+	struct version_number version;
+	struct packed_vdo_component_41_0 packed;
+	int result;
+
+	decode_version_number(buffer, offset, &version);
+	result = validate_version(version, VDO_COMPONENT_DATA_41_0, "VDO component data");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	memcpy(&packed, buffer + *offset, sizeof(packed));
+	*offset += sizeof(packed);
+	*component = unpack_vdo_component_41_0(packed);
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_validate_config() - Validate constraints on a VDO config.
+ * @config: The VDO config.
+ * @physical_block_count: The minimum block count of the underlying storage.
+ * @logical_block_count: The expected logical size of the VDO, or 0 if the logical size may be
+ *			 unspecified.
+ *
+ * Return: A success or error code.
+ */
+int vdo_validate_config(const struct vdo_config *config,
+			block_count_t physical_block_count,
+			block_count_t logical_block_count)
+{
+	struct slab_config slab_config;
+	int result;
+
+	result = ASSERT(config->slab_size > 0, "slab size unspecified");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(is_power_of_2(config->slab_size), "slab size must be a power of two");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS),
+			"slab size must be less than or equal to 2^%d",
+			MAX_VDO_SLAB_BITS);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS,
+			"slab journal size meets minimum size");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(config->slab_journal_blocks <= config->slab_size,
+			"slab journal size is within expected bound");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = vdo_configure_slab(config->slab_size, config->slab_journal_blocks, &slab_config);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = ASSERT((slab_config.data_blocks >= 1),
+			"slab must be able to hold at least one block");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(config->physical_blocks > 0, "physical blocks unspecified");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(config->physical_blocks <= MAXIMUM_VDO_PHYSICAL_BLOCKS,
+			"physical block count %llu exceeds maximum %llu",
+			(unsigned long long) config->physical_blocks,
+			(unsigned long long) MAXIMUM_VDO_PHYSICAL_BLOCKS);
+	if (result != UDS_SUCCESS)
+		return VDO_OUT_OF_RANGE;
+
+	if (physical_block_count != config->physical_blocks) {
+		uds_log_error("A physical size of %llu blocks was specified, not the %llu blocks configured in the vdo super block",
+			      (unsigned long long) physical_block_count,
+			      (unsigned long long) config->physical_blocks);
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	if (logical_block_count > 0) {
+		result = ASSERT((config->logical_blocks > 0), "logical blocks unspecified");
+		if (result != UDS_SUCCESS)
+			return result;
+
+		if (logical_block_count != config->logical_blocks) {
+			uds_log_error("A logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block",
+				      (unsigned long long) logical_block_count,
+				      (unsigned long long) config->logical_blocks);
+			return VDO_PARAMETER_MISMATCH;
+		}
+	}
+
+	result = ASSERT(config->logical_blocks <= MAXIMUM_VDO_LOGICAL_BLOCKS,
+			"logical blocks too large");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(config->recovery_journal_size > 0, "recovery journal size unspecified");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = ASSERT(is_power_of_2(config->recovery_journal_size),
+			"recovery journal size must be a power of two");
+	if (result != UDS_SUCCESS)
+		return result;
+
+	return result;
+}
+
+/**
+ * vdo_destroy_component_states() - Clean up any allocations in a vdo_component_states.
+ * @states: The component states to destroy.
+ */
+void vdo_destroy_component_states(struct vdo_component_states *states)
+{
+	if (states == NULL)
+		return;
+
+	vdo_uninitialize_layout(&states->layout);
+}
+
+/**
+ * decode_components() - Decode the components now that we know the component data is a version we
+ *                       understand.
+ * @buffer: The buffer being decoded.
+ * @offset: The offset to start decoding from.
+ * @geometry: The vdo geometry
+ * @states: An object to hold the successfully decoded state.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check decode_components(u8 *buffer,
+					  size_t *offset,
+					  struct volume_geometry *geometry,
+					  struct vdo_component_states *states)
+{
+	int result;
+
+	result = decode_vdo_component(buffer, offset, &states->vdo);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = decode_layout(buffer,
+			       offset,
+			       vdo_get_data_region_start(*geometry) + 1,
+			       states->vdo.config.physical_blocks,
+			       &states->layout);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = decode_recovery_journal_state_7_0(buffer, offset, &states->recovery_journal);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = decode_slab_depot_state_2_0(buffer, offset, &states->slab_depot);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = decode_block_map_state_2_0(buffer, offset, &states->block_map);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE,
+			"All decoded component data was used");
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_decode_component_states() - Decode the payload of a super block.
+ * @buffer: The buffer containing the encoded super block contents.
+ * @geometry: The vdo geometry
+ * @states: A pointer to hold the decoded states.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_decode_component_states(u8 *buffer,
+				struct volume_geometry *geometry,
+				struct vdo_component_states *states)
+{
+	int result;
+	size_t offset = VDO_COMPONENT_DATA_OFFSET;
+
+	/* Get and check the release version against the one from the geometry. */
+	decode_u32_le(buffer, &offset, &states->release_version);
+	if (states->release_version != geometry->release_version)
+		return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+					      "Geometry release version %u does not match super block release version %u",
+					      geometry->release_version,
+					      states->release_version);
+
+	/* Check the VDO volume version */
+	decode_version_number(buffer, &offset, &states->volume_version);
+	result = validate_version(VDO_VOLUME_VERSION_67_0, states->volume_version, "volume");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = decode_components(buffer, &offset, geometry, states);
+	if (result != VDO_SUCCESS)
+		vdo_uninitialize_layout(&states->layout);
+
+	return result;
+}
+
+/**
+ * vdo_validate_component_states() - Validate the decoded super block configuration.
+ * @states: The state decoded from the super block.
+ * @geometry_nonce: The nonce from the geometry block.
+ * @physical_size: The minimum block count of the underlying storage.
+ * @logical_size: The expected logical size of the VDO, or 0 if the logical size may be
+ *                unspecified.
+ *
+ * Return: VDO_SUCCESS or an error if the configuration is invalid.
+ */
+int vdo_validate_component_states(struct vdo_component_states *states,
+				  nonce_t geometry_nonce,
+				  block_count_t physical_size,
+				  block_count_t logical_size)
+{
+	if (geometry_nonce != states->vdo.nonce)
+		return uds_log_error_strerror(VDO_BAD_NONCE,
+					      "Geometry nonce %llu does not match superblock nonce %llu",
+					      (unsigned long long) geometry_nonce,
+					      (unsigned long long) states->vdo.nonce);
+
+	return vdo_validate_config(&states->vdo.config, physical_size, logical_size);
+}
+
+/**
+ * vdo_encode_component_states() - Encode the state of all vdo components in the super block.
+ */
+static void
+vdo_encode_component_states(u8 *buffer, size_t *offset, const struct vdo_component_states *states)
+{
+	encode_u32_le(buffer, offset, states->release_version);
+	encode_version_number(buffer, offset, states->volume_version);
+	encode_vdo_component(buffer, offset, states->vdo);
+	encode_layout(buffer, offset, &states->layout);
+	encode_recovery_journal_state_7_0(buffer, offset, states->recovery_journal);
+	encode_slab_depot_state_2_0(buffer, offset, states->slab_depot);
+	encode_block_map_state_2_0(buffer, offset, states->block_map);
+
+	ASSERT_LOG_ONLY(*offset == VDO_COMPONENT_DATA_OFFSET + VDO_COMPONENT_DATA_SIZE,
+			"All super block component data was encoded");
+}
+
+/**
+ * vdo_encode_super_block() - Encode a super block into its on-disk representation.
+ */
+void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states)
+{
+	u32 checksum;
+	struct header header = SUPER_BLOCK_HEADER_12_0;
+	size_t offset = 0;
+
+	header.size += VDO_COMPONENT_DATA_SIZE;
+	vdo_encode_header(buffer, &offset, &header);
+	vdo_encode_component_states(buffer, &offset, states);
+
+	checksum = vdo_crc32(buffer, offset);
+	encode_u32_le(buffer, &offset, checksum);
+
+	/*
+	 * Even though the buffer is a full block, to avoid the potential corruption from a torn
+	 * write, the entire encoding must fit in the first sector.
+	 */
+	ASSERT_LOG_ONLY(offset <= VDO_SECTOR_SIZE, "entire superblock must fit in one sector");
+}
+
+/**
+ * vdo_decode_super_block() - Decode a super block from its on-disk representation.
+ */
+int vdo_decode_super_block(u8 *buffer)
+{
+	struct header header;
+	int result;
+	u32 checksum, saved_checksum;
+	size_t offset = 0;
+
+	/* Decode and validate the header. */
+	vdo_decode_header(buffer, &offset, &header);
+	result = vdo_validate_header(&SUPER_BLOCK_HEADER_12_0, &header, false, __func__);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (header.size > VDO_COMPONENT_DATA_SIZE + sizeof(u32))
+		/*
+		 * We can't check release version or checksum until we know the content size, so we
+		 * have to assume a version mismatch on unexpected values.
+		 */
+		return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+					      "super block contents too large: %zu",
+					      header.size);
+
+	/* Skip past the component data for now, to verify the checksum. */
+	offset += VDO_COMPONENT_DATA_SIZE;
+
+	checksum = vdo_crc32(buffer, offset);
+	decode_u32_le(buffer, &offset, &saved_checksum);
+
+	result = ASSERT(offset == VDO_SUPER_BLOCK_FIXED_SIZE + VDO_COMPONENT_DATA_SIZE,
+			"must have decoded entire superblock payload");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return ((checksum != saved_checksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS);
+}
diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h
new file mode 100644
index 00000000000..8432112e928
--- /dev/null
+++ b/drivers/md/dm-vdo/encodings.h
@@ -0,0 +1,1307 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_ENCODINGS_H
+#define VDO_ENCODINGS_H
+
+#include <linux/blk_types.h>
+#include <linux/crc32.h>
+#include <linux/limits.h>
+#include <linux/uuid.h>
+
+#include "numeric.h"
+#include "uds.h"
+
+#include "constants.h"
+#include "types.h"
+
+/*
+ * An in-memory representation of a version number for versioned structures on disk.
+ *
+ * A version number consists of two portions, a major version and a minor version. Any format
+ * change which does not require an explicit upgrade step from the previous version should
+ * increment the minor version. Any format change which either requires an explicit upgrade step,
+ * or is wholly incompatible (i.e. can not be upgraded to), should increment the major version, and
+ * set the minor version to 0.
+ */
+struct version_number {
+	u32 major_version;
+	u32 minor_version;
+};
+
+/*
+ * A packed, machine-independent, on-disk representation of a version_number. Both fields are
+ * stored in little-endian byte order.
+ */
+struct packed_version_number {
+	__le32 major_version;
+	__le32 minor_version;
+} __packed;
+
+/* The registry of component ids for use in headers */
+#define VDO_SUPER_BLOCK 0
+#define VDO_LAYOUT 1
+#define VDO_RECOVERY_JOURNAL 2
+#define VDO_SLAB_DEPOT 3
+#define VDO_BLOCK_MAP 4
+#define VDO_GEOMETRY_BLOCK 5
+
+/* The header for versioned data stored on disk. */
+struct header {
+	u32 id; /* The component this is a header for */
+	struct version_number version; /* The version of the data format */
+	size_t size; /* The size of the data following this header */
+};
+
+/* A packed, machine-independent, on-disk representation of a component header. */
+struct packed_header {
+	__le32 id;
+	struct packed_version_number version;
+	__le64 size;
+} __packed;
+
+enum {
+	VDO_GEOMETRY_BLOCK_LOCATION = 0,
+	VDO_GEOMETRY_MAGIC_NUMBER_SIZE = 8,
+	VDO_DEFAULT_GEOMETRY_BLOCK_VERSION = 5,
+};
+
+struct index_config {
+	u32 mem;
+	u32 unused;
+	bool sparse;
+} __packed;
+
+enum volume_region_id {
+	VDO_INDEX_REGION = 0,
+	VDO_DATA_REGION = 1,
+	VDO_VOLUME_REGION_COUNT,
+};
+
+struct volume_region {
+	/* The ID of the region */
+	enum volume_region_id id;
+	/*
+	 * The absolute starting offset on the device. The region continues until the next region
+	 * begins.
+	 */
+	physical_block_number_t start_block;
+} __packed;
+
+struct volume_geometry {
+	/* The release version number of this volume */
+	release_version_number_t release_version;
+	/* The nonce of this volume */
+	nonce_t nonce;
+	/* The uuid of this volume */
+	uuid_t uuid;
+	/* The block offset to be applied to bios */
+	block_count_t bio_offset;
+	/* The regions in ID order */
+	struct volume_region regions[VDO_VOLUME_REGION_COUNT];
+	/* The index config */
+	struct index_config index_config;
+} __packed;
+
+/* This volume geometry struct is used for sizing only */
+struct volume_geometry_4_0 {
+	/* The release version number of this volume */
+	release_version_number_t release_version;
+	/* The nonce of this volume */
+	nonce_t nonce;
+	/* The uuid of this volume */
+	uuid_t uuid;
+	/* The regions in ID order */
+	struct volume_region regions[VDO_VOLUME_REGION_COUNT];
+	/* The index config */
+	struct index_config index_config;
+} __packed;
+
+extern const u8 VDO_GEOMETRY_MAGIC_NUMBER[VDO_GEOMETRY_MAGIC_NUMBER_SIZE + 1];
+
+/**
+ * DOC: Block map entries
+ *
+ * The entry for each logical block in the block map is encoded into five bytes, which saves space
+ * in both the on-disk and in-memory layouts. It consists of the 36 low-order bits of a
+ * physical_block_number_t (addressing 256 terabytes with a 4KB block size) and a 4-bit encoding of
+ * a block_mapping_state.
+ *
+ * Of the 8 high bits of the 5-byte structure:
+ *
+ * Bits 7..4: The four highest bits of the 36-bit physical block number
+ * Bits 3..0: The 4-bit block_mapping_state
+ *
+ * The following 4 bytes are the low order bytes of the physical block number, in little-endian
+ * order.
+ *
+ * Conversion functions to and from a data location are provided.
+ */
+struct block_map_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	unsigned mapping_state : 4;
+	unsigned pbn_high_nibble : 4;
+#else
+	unsigned pbn_high_nibble : 4;
+	unsigned mapping_state : 4;
+#endif
+
+	__le32 pbn_low_word;
+} __packed;
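+/*
+ * For example, a mapping to the 36-bit PBN 0x123456789 with mapping state 2 would be stored
+ * with pbn_high_nibble = 0x1, pbn_low_word = cpu_to_le32(0x23456789), and mapping_state = 2.
+ */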
+
+struct block_map_page_header {
+	__le64 nonce;
+	__le64 pbn;
+
+	/* May be non-zero on disk */
+	u8 unused_long_word[8];
+
+	/* Whether this page has been written twice to disk */
+	bool initialized;
+
+	/* Always zero on disk */
+	u8 unused_byte1;
+
+	/* May be non-zero on disk */
+	u8 unused_byte2;
+	u8 unused_byte3;
+} __packed;
+
+struct block_map_page {
+	struct packed_version_number version;
+	struct block_map_page_header header;
+	struct block_map_entry entries[];
+} __packed;
+
+enum block_map_page_validity {
+	VDO_BLOCK_MAP_PAGE_VALID,
+	VDO_BLOCK_MAP_PAGE_INVALID,
+	/* Valid page found in the wrong location on disk */
+	VDO_BLOCK_MAP_PAGE_BAD,
+};
+
+struct block_map_state_2_0 {
+	physical_block_number_t flat_page_origin;
+	block_count_t flat_page_count;
+	physical_block_number_t root_origin;
+	block_count_t root_count;
+} __packed;
+
+struct boundary {
+	page_number_t levels[VDO_BLOCK_MAP_TREE_HEIGHT];
+};
+
+extern const struct header VDO_BLOCK_MAP_HEADER_2_0;
+
+/* The state of the recovery journal as encoded in the VDO super block. */
+struct recovery_journal_state_7_0 {
+	/* Sequence number to start the journal */
+	sequence_number_t journal_start;
+	/* Number of logical blocks used by VDO */
+	block_count_t logical_blocks_used;
+	/* Number of block map pages allocated */
+	block_count_t block_map_data_blocks;
+} __packed;
+
+extern const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0;
+
+typedef u16 journal_entry_count_t;
+
+/*
+ * A recovery journal entry stores three physical locations: a data location that is the value of a
+ * single mapping in the block map tree, and the two locations of the block map pages and slots
+ * that are acquiring and releasing a reference to the location. The journal entry also stores an
+ * operation code that says whether the mapping is for a logical block or for the block map tree
+ * itself.
+ */
+struct recovery_journal_entry {
+	struct block_map_slot slot;
+	struct data_location mapping;
+	struct data_location unmapping;
+	enum journal_operation operation;
+};
+
+/* The packed, on-disk representation of a recovery journal entry. */
+struct packed_recovery_journal_entry {
+	/*
+	 * In little-endian bit order:
+	 * Bits 15..12: The four highest bits of the 36-bit physical block number of the block map
+	 * tree page
+	 * Bits 11..2: The 10-bit block map page slot number
+	 * Bits 1..0: The journal_operation of the entry (this actually only requires 1 bit, but
+	 *            it is convenient to keep the extra bit as part of this field).
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	unsigned operation : 2;
+	unsigned slot_low : 6;
+	unsigned slot_high : 4;
+	unsigned pbn_high_nibble : 4;
+#else
+	unsigned slot_low : 6;
+	unsigned operation : 2;
+	unsigned pbn_high_nibble : 4;
+	unsigned slot_high : 4;
+#endif
+
+	/*
+	 * Bits 47..16: The 32 low-order bits of the block map page PBN, in little-endian byte
+	 * order
+	 */
+	__le32 pbn_low_word;
+
+	/*
+	 * Bits 87..48: The five-byte block map entry encoding the location that will be stored in
+	 * the block map page slot
+	 */
+	struct block_map_entry mapping;
+
+	/*
+	 * Bits 127..88: The five-byte block map entry encoding the location that was stored in the
+	 * block map page slot
+	 */
+	struct block_map_entry unmapping;
+} __packed;
+
+/* The packed, on-disk representation of an old format recovery journal entry. */
+struct packed_recovery_journal_entry_1 {
+	/*
+	 * In little-endian bit order:
+	 * Bits 15..12: The four highest bits of the 36-bit physical block number of the block map
+	 *              tree page
+	 * Bits 11..2: The 10-bit block map page slot number
+	 * Bits 1..0: The 2-bit journal_operation of the entry
+	 *
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	unsigned operation : 2;
+	unsigned slot_low : 6;
+	unsigned slot_high : 4;
+	unsigned pbn_high_nibble : 4;
+#else
+	unsigned slot_low : 6;
+	unsigned operation : 2;
+	unsigned pbn_high_nibble : 4;
+	unsigned slot_high : 4;
+#endif
+
+	/*
+	 * Bits 47..16: The 32 low-order bits of the block map page PBN, in little-endian byte
+	 * order
+	 */
+	__le32 pbn_low_word;
+
+	/*
+	 * Bits 87..48: The five-byte block map entry encoding the location that was or will be
+	 * stored in the block map page slot
+	 */
+	struct block_map_entry block_map_entry;
+} __packed;
+
+enum journal_operation_1 {
+	VDO_JOURNAL_DATA_DECREMENT = 0,
+	VDO_JOURNAL_DATA_INCREMENT = 1,
+	VDO_JOURNAL_BLOCK_MAP_DECREMENT = 2,
+	VDO_JOURNAL_BLOCK_MAP_INCREMENT = 3,
+} __packed;
+
+struct recovery_block_header {
+	sequence_number_t block_map_head; /* Block map head sequence number */
+	sequence_number_t slab_journal_head; /* Slab journal head seq. number */
+	sequence_number_t sequence_number; /* Sequence number for this block */
+	nonce_t nonce; /* A given VDO instance's nonce */
+	block_count_t logical_blocks_used; /* Logical blocks in use */
+	block_count_t block_map_data_blocks; /* Allocated block map pages */
+	journal_entry_count_t entry_count; /* Number of entries written */
+	u8 check_byte; /* The protection check byte */
+	u8 recovery_count; /* Number of recoveries completed */
+	enum vdo_metadata_type metadata_type; /* Metadata type */
+};
+
+/*
+ * The packed, on-disk representation of a recovery journal block header. All fields are kept in
+ * little-endian byte order.
+ */
+struct packed_journal_header {
+	/* Block map head 64-bit sequence number */
+	__le64 block_map_head;
+
+	/* Slab journal head 64-bit sequence number */
+	__le64 slab_journal_head;
+
+	/* The 64-bit sequence number for this block */
+	__le64 sequence_number;
+
+	/* A given VDO instance's 64-bit nonce */
+	__le64 nonce;
+
+	/* 8-bit metadata type (should always be one for the recovery journal) */
+	u8 metadata_type;
+
+	/* 16-bit count of the entries encoded in the block */
+	__le16 entry_count;
+
+	/* 64-bit count of the logical blocks used when this block was opened */
+	__le64 logical_blocks_used;
+
+	/* 64-bit count of the block map blocks used when this block was opened */
+	__le64 block_map_data_blocks;
+
+	/* The protection check byte */
+	u8 check_byte;
+
+	/* The number of recoveries completed */
+	u8 recovery_count;
+} __packed;
+
+struct packed_journal_sector {
+	/* The protection check byte */
+	u8 check_byte;
+
+	/* The number of recoveries completed */
+	u8 recovery_count;
+
+	/* The number of entries in this sector */
+	u8 entry_count;
+
+	/* Journal entries for this sector */
+	struct packed_recovery_journal_entry entries[];
+} __packed;
+
+enum {
+	/* The number of entries in each sector (except the last) when filled */
+	RECOVERY_JOURNAL_ENTRIES_PER_SECTOR =
+		((VDO_SECTOR_SIZE - sizeof(struct packed_journal_sector)) /
+		 sizeof(struct packed_recovery_journal_entry)),
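+	/*
+	 * The packed block header occupies the block's first sector, so only seven of its eight
+	 * sectors hold entries.
+	 */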
+	RECOVERY_JOURNAL_ENTRIES_PER_BLOCK = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR * 7,
+	/* The number of entries in a v1 recovery journal block. */
+	RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK = 311,
+	/* The number of entries in each v1 sector (except the last) when filled */
+	RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR =
+		((VDO_SECTOR_SIZE - sizeof(struct packed_journal_sector)) /
+		 sizeof(struct packed_recovery_journal_entry_1)),
+	/* The number of entries in the last sector when a block is full */
+	RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR =
+		(RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK % RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR),
+};
+
+/* A type representing a reference count of a block. */
+typedef u8 vdo_refcount_t;
+
+/* The absolute position of an entry in a recovery journal or slab journal. */
+struct journal_point {
+	sequence_number_t sequence_number;
+	journal_entry_count_t entry_count;
+};
+
+/* A packed, platform-independent encoding of a struct journal_point. */
+struct packed_journal_point {
+	/*
+	 * The packed representation is the little-endian 64-bit representation of the low-order 48
+	 * bits of the sequence number, shifted up 16 bits, or'ed with the 16-bit entry count.
+	 *
+	 * Very long-term, the top 16 bits of the sequence number may not always be zero, as this
+	 * encoding assumes--see BZ 1523240.
+	 */
+	__le64 encoded_point;
+} __packed;
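+/*
+ * For example, a journal point with sequence number 5 and entry count 3 is encoded as
+ * (5 << 16) | 3 = 0x50003, stored as a little-endian u64.
+ */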
+
+/* Special vdo_refcount_t values. */
+#define EMPTY_REFERENCE_COUNT 0
+enum {
+	MAXIMUM_REFERENCE_COUNT = 254,
+	PROVISIONAL_REFERENCE_COUNT = 255,
+};
+
+enum {
+	COUNTS_PER_SECTOR =
+		((VDO_SECTOR_SIZE - sizeof(struct packed_journal_point)) / sizeof(vdo_refcount_t)),
+	COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * VDO_SECTORS_PER_BLOCK,
+};
+
+/* The format of each sector of a reference_block on disk. */
+struct packed_reference_sector {
+	struct packed_journal_point commit_point;
+	vdo_refcount_t counts[COUNTS_PER_SECTOR];
+} __packed;
+
+struct packed_reference_block {
+	struct packed_reference_sector sectors[VDO_SECTORS_PER_BLOCK];
+};
+
+struct slab_depot_state_2_0 {
+	struct slab_config slab_config;
+	physical_block_number_t first_block;
+	physical_block_number_t last_block;
+	zone_count_t zone_count;
+} __packed;
+
+extern const struct header VDO_SLAB_DEPOT_HEADER_2_0;
+
+/*
+ * vdo_slab journal blocks may have one of two formats, depending upon whether or not any of the
+ * entries in the block are block map increments. Since the steady state for a VDO is that all of
+ * the necessary block map pages will be allocated, most slab journal blocks will have only data
+ * entries. Such blocks can hold more entries, hence the two formats.
+ */
+
+/* A single slab journal entry */
+struct slab_journal_entry {
+	slab_block_number sbn;
+	enum journal_operation operation;
+	bool increment;
+};
+
+/* A single slab journal entry in its on-disk form */
+typedef struct {
+	u8 offset_low8;
+	u8 offset_mid8;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	unsigned offset_high7 : 7;
+	unsigned increment : 1;
+#else
+	unsigned increment : 1;
+	unsigned offset_high7 : 7;
+#endif
+} __packed packed_slab_journal_entry;
+
+/* The unpacked representation of the header of a slab journal block */
+struct slab_journal_block_header {
+	/* Sequence number for head of journal */
+	sequence_number_t head;
+	/* Sequence number for this block */
+	sequence_number_t sequence_number;
+	/* The nonce for a given VDO instance */
+	nonce_t nonce;
+	/* Recovery journal point for last entry */
+	struct journal_point recovery_point;
+	/* Metadata type */
+	enum vdo_metadata_type metadata_type;
+	/* Whether this block contains block map increments */
+	bool has_block_map_increments;
+	/* The number of entries in the block */
+	journal_entry_count_t entry_count;
+};
+
+/*
+ * The packed, on-disk representation of a slab journal block header. All fields are kept in
+ * little-endian byte order.
+ */
+struct packed_slab_journal_block_header {
+	/* 64-bit sequence number for head of journal */
+	__le64 head;
+	/* 64-bit sequence number for this block */
+	__le64 sequence_number;
+	/* Recovery journal point for the last entry, packed into 64 bits */
+	struct packed_journal_point recovery_point;
+	/* The 64-bit nonce for a given VDO instance */
+	__le64 nonce;
+	/* 8-bit metadata type (should always be two, for the slab journal) */
+	u8 metadata_type;
+	/* Whether this block contains block map increments */
+	bool has_block_map_increments;
+	/* 16-bit count of the entries encoded in the block */
+	__le16 entry_count;
+} __packed;
+
+enum {
+	VDO_SLAB_JOURNAL_PAYLOAD_SIZE =
+		VDO_BLOCK_SIZE - sizeof(struct packed_slab_journal_block_header),
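+	/*
+	 * A full-format entry is a 24-bit packed entry plus one bit in entry_types: 25 bits in all.
+	 */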
+	VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK = (VDO_SLAB_JOURNAL_PAYLOAD_SIZE * 8) / 25,
+	VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE =
+		((VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK - 1) / 8) + 1,
+	VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK =
+		(VDO_SLAB_JOURNAL_PAYLOAD_SIZE / sizeof(packed_slab_journal_entry)),
+};
+
+/* The payload of a slab journal block which has block map increments */
+struct full_slab_journal_entries {
+	/* The entries themselves */
+	packed_slab_journal_entry entries[VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK];
+	/* The bit map indicating which entries are block map increments */
+	u8 entry_types[VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE];
+} __packed;
+
+typedef union {
+	/* Entries which include block map increments */
+	struct full_slab_journal_entries full_entries;
+	/* Entries which are only data updates */
+	packed_slab_journal_entry entries[VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK];
+	/* Ensure the payload fills to the end of the block */
+	u8 space[VDO_SLAB_JOURNAL_PAYLOAD_SIZE];
+} __packed slab_journal_payload;
+
+struct packed_slab_journal_block {
+	struct packed_slab_journal_block_header header;
+	slab_journal_payload payload;
+} __packed;
+
+/* The offset of a slab journal tail block. */
+typedef u8 tail_block_offset_t;
+
+struct slab_summary_entry {
+	/* Bits 7..0: The offset of the tail block within the slab journal */
+	tail_block_offset_t tail_block_offset;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	/* Bits 13..8: A hint about the fullness of the slab */
+	unsigned int fullness_hint : 6;
+	/* Bit 14: Whether the ref_counts must be loaded from the layer */
+	unsigned int load_ref_counts : 1;
+	/* Bit 15: The believed cleanliness of this slab */
+	unsigned int is_dirty : 1;
+#else
+	/* Bit 15: The believed cleanliness of this slab */
+	unsigned int is_dirty : 1;
+	/* Bit 14: Whether the ref_counts must be loaded from the layer */
+	unsigned int load_ref_counts : 1;
+	/* Bits 13..8: A hint about the fullness of the slab */
+	unsigned int fullness_hint : 6;
+#endif
+} __packed;
+
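+/* Each slab_summary_entry packs into two bytes, so a 4K block holds 2048 of them. */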
+enum {
+	VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS = 6,
+	VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK = VDO_BLOCK_SIZE / sizeof(struct slab_summary_entry),
+	VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE = MAX_VDO_SLABS / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK,
+	VDO_SLAB_SUMMARY_BLOCKS = VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * MAX_VDO_PHYSICAL_ZONES,
+};
+
+struct layout {
+	physical_block_number_t start;
+	block_count_t size;
+	physical_block_number_t first_free;
+	physical_block_number_t last_free;
+	size_t num_partitions;
+	struct partition *head;
+};
+
+struct partition {
+	enum partition_id id; /* The id of this partition */
+	physical_block_number_t offset; /* The offset into the layout of this partition */
+	block_count_t count; /* The number of blocks in the partition */
+	struct partition *next; /* A pointer to the next partition in the layout */
+};
+
+struct layout_3_0 {
+	physical_block_number_t first_free;
+	physical_block_number_t last_free;
+	u8 partition_count;
+} __packed;
+
+struct partition_3_0 {
+	enum partition_id id;
+	physical_block_number_t offset;
+	physical_block_number_t base; /* unused but retained for backward compatibility */
+	block_count_t count;
+} __packed;
+
+/*
+ * The configuration of the VDO service.
+ */
+struct vdo_config {
+	block_count_t logical_blocks; /* number of logical blocks */
+	block_count_t physical_blocks; /* number of physical blocks */
+	block_count_t slab_size; /* number of blocks in a slab */
+	block_count_t recovery_journal_size; /* number of recovery journal blocks */
+	block_count_t slab_journal_blocks; /* number of slab journal blocks */
+};
+
+/* This is the structure that captures the vdo fields saved as a super block component. */
+struct vdo_component {
+	enum vdo_state state;
+	u64 complete_recoveries;
+	u64 read_only_recoveries;
+	struct vdo_config config;
+	nonce_t nonce;
+};
+
+/*
+ * A packed, machine-independent, on-disk representation of the vdo_config in the VDO component
+ * data in the super block.
+ */
+struct packed_vdo_config {
+	__le64 logical_blocks;
+	__le64 physical_blocks;
+	__le64 slab_size;
+	__le64 recovery_journal_size;
+	__le64 slab_journal_blocks;
+} __packed;
+
+/*
+ * A packed, machine-independent, on-disk representation of version 41.0 of the VDO component data
+ * in the super block.
+ */
+struct packed_vdo_component_41_0 {
+	__le32 state;
+	__le64 complete_recoveries;
+	__le64 read_only_recoveries;
+	struct packed_vdo_config config;
+	__le64 nonce;
+} __packed;
+
+/*
+ * The version of the on-disk format of a VDO volume. This should be incremented any time the
+ * on-disk representation of any VDO structure changes. Changes which require only online upgrade
+ * steps should increment the minor version. Changes which require an offline upgrade or which
+ * cannot be upgraded to at all should increment the major version and set the minor version to 0.
+ */
+extern const struct version_number VDO_VOLUME_VERSION_67_0;
+
+enum {
+	VDO_ENCODED_HEADER_SIZE = sizeof(struct packed_header),
+	BLOCK_MAP_COMPONENT_ENCODED_SIZE =
+		VDO_ENCODED_HEADER_SIZE + sizeof(struct block_map_state_2_0),
+	RECOVERY_JOURNAL_COMPONENT_ENCODED_SIZE =
+		VDO_ENCODED_HEADER_SIZE + sizeof(struct recovery_journal_state_7_0),
+	SLAB_DEPOT_COMPONENT_ENCODED_SIZE =
+		VDO_ENCODED_HEADER_SIZE + sizeof(struct slab_depot_state_2_0),
+	VDO_PARTITION_COUNT = 4,
+	VDO_LAYOUT_ENCODED_SIZE = (VDO_ENCODED_HEADER_SIZE +
+				   sizeof(struct layout_3_0) +
+				   (sizeof(struct partition_3_0) * VDO_PARTITION_COUNT)),
+	VDO_SUPER_BLOCK_FIXED_SIZE = VDO_ENCODED_HEADER_SIZE + sizeof(u32),
+	VDO_MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - VDO_SUPER_BLOCK_FIXED_SIZE,
+	VDO_COMPONENT_ENCODED_SIZE =
+		(sizeof(struct packed_version_number) + sizeof(struct packed_vdo_component_41_0)),
+	VDO_COMPONENT_DATA_OFFSET = VDO_ENCODED_HEADER_SIZE,
+	VDO_COMPONENT_DATA_SIZE = (sizeof(release_version_number_t) +
+				   sizeof(struct packed_version_number) +
+				   VDO_COMPONENT_ENCODED_SIZE +
+				   VDO_LAYOUT_ENCODED_SIZE +
+				   RECOVERY_JOURNAL_COMPONENT_ENCODED_SIZE +
+				   SLAB_DEPOT_COMPONENT_ENCODED_SIZE +
+				   BLOCK_MAP_COMPONENT_ENCODED_SIZE),
+};
+
+/* The entirety of the component data encoded in the VDO super block. */
+struct vdo_component_states {
+	/* The release version */
+	release_version_number_t release_version;
+
+	/* The VDO volume version */
+	struct version_number volume_version;
+
+	/* Components */
+	struct vdo_component vdo;
+	struct block_map_state_2_0 block_map;
+	struct recovery_journal_state_7_0 recovery_journal;
+	struct slab_depot_state_2_0 slab_depot;
+
+	/* Our partitioning of the underlying storage */
+	struct layout layout;
+};
+
+/**
+ * vdo_are_same_version() - Check whether two version numbers are the same.
+ * @version_a: The first version.
+ * @version_b: The second version.
+ *
+ * Return: true if the two versions are the same.
+ */
+static inline bool
+vdo_are_same_version(struct version_number version_a, struct version_number version_b)
+{
+	return ((version_a.major_version == version_b.major_version) &&
+		(version_a.minor_version == version_b.minor_version));
+}
+
+/**
+ * vdo_is_upgradable_version() - Check whether an actual version is upgradable to an expected
+ *                               version.
+ * @expected_version: The expected version.
+ * @actual_version: The version being validated.
+ *
+ * An actual version is upgradable if its major number matches the expected major number and its
+ * minor number is less than the expected minor number.
+ *
+ * Return: true if the actual version is upgradable.
+ */
+static inline bool vdo_is_upgradable_version(struct version_number expected_version,
+					     struct version_number actual_version)
+{
+	return ((expected_version.major_version == actual_version.major_version) &&
+		(expected_version.minor_version > actual_version.minor_version));
+}
+
+int __must_check vdo_validate_header(const struct header *expected_header,
+				     const struct header *actual_header,
+				     bool exact_size,
+				     const char *component_name);
+
+void vdo_encode_header(u8 *buffer, size_t *offset, const struct header *header);
+void vdo_decode_header(u8 *buffer, size_t *offset, struct header *header);
+
+/**
+ * vdo_pack_version_number() - Convert a version_number to its packed on-disk representation.
+ * @version: The version number to convert.
+ *
+ * Return: The platform-independent representation of the version.
+ */
+static inline struct packed_version_number vdo_pack_version_number(struct version_number version)
+{
+	return (struct packed_version_number) {
+		.major_version = __cpu_to_le32(version.major_version),
+		.minor_version = __cpu_to_le32(version.minor_version),
+	};
+}
+
+/**
+ * vdo_unpack_version_number() - Convert a packed_version_number to its native in-memory
+ *                               representation.
+ * @version: The version number to convert.
+ *
+ * Return: The platform-independent representation of the version.
+ */
+static inline struct version_number vdo_unpack_version_number(struct packed_version_number version)
+{
+	return (struct version_number) {
+		.major_version = __le32_to_cpu(version.major_version),
+		.minor_version = __le32_to_cpu(version.minor_version),
+	};
+}
+
+/**
+ * vdo_pack_header() - Convert a component header to its packed on-disk representation.
+ * @header: The header to convert.
+ *
+ * Return: The platform-independent representation of the header.
+ */
+static inline struct packed_header vdo_pack_header(const struct header *header)
+{
+	return (struct packed_header) {
+		.id = __cpu_to_le32(header->id),
+		.version = vdo_pack_version_number(header->version),
+		.size = __cpu_to_le64(header->size),
+	};
+}
+
+/**
+ * vdo_unpack_header() - Convert a packed_header to its native in-memory representation.
+ * @header: The header to convert.
+ *
+ * Return: The platform-independent representation of the header.
+ */
+static inline struct header vdo_unpack_header(const struct packed_header *header)
+{
+	return (struct header) {
+		.id = __le32_to_cpu(header->id),
+		.version = vdo_unpack_version_number(header->version),
+		.size = __le64_to_cpu(header->size),
+	};
+}
+
+/**
+ * vdo_get_index_region_start() - Get the start of the index region from a geometry.
+ * @geometry: The geometry.
+ *
+ * Return: The start of the index region.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_index_region_start(struct volume_geometry geometry)
+{
+	return geometry.regions[VDO_INDEX_REGION].start_block;
+}
+
+/**
+ * vdo_get_data_region_start() - Get the start of the data region from a geometry.
+ * @geometry: The geometry.
+ *
+ * Return: The start of the data region.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_data_region_start(struct volume_geometry geometry)
+{
+	return geometry.regions[VDO_DATA_REGION].start_block;
+}
+
+/**
+ * vdo_get_index_region_size() - Get the size of the index region from a geometry.
+ * @geometry: The geometry.
+ *
+ * Return: The size of the index region.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_index_region_size(struct volume_geometry geometry)
+{
+	return vdo_get_data_region_start(geometry) -
+		vdo_get_index_region_start(geometry);
+}
+
+int __must_check vdo_parse_geometry_block(unsigned char *block, struct volume_geometry *geometry);
+
+static inline bool vdo_is_state_compressed(const enum block_mapping_state mapping_state)
+{
+	return (mapping_state > VDO_MAPPING_STATE_UNCOMPRESSED);
+}
+
+static inline struct block_map_entry
+vdo_pack_block_map_entry(physical_block_number_t pbn, enum block_mapping_state mapping_state)
+{
+	return (struct block_map_entry) {
+		.mapping_state = (mapping_state & 0x0F),
+		.pbn_high_nibble = ((pbn >> 32) & 0x0F),
+		.pbn_low_word = __cpu_to_le32(pbn & UINT_MAX),
+	};
+}
+
+static inline struct data_location vdo_unpack_block_map_entry(const struct block_map_entry *entry)
+{
+	physical_block_number_t low32 = __le32_to_cpu(entry->pbn_low_word);
+	physical_block_number_t high4 = entry->pbn_high_nibble;
+
+	return (struct data_location) {
+		.pbn = ((high4 << 32) | low32),
+		.state = entry->mapping_state,
+	};
+}
+
+static inline bool vdo_is_mapped_location(const struct data_location *location)
+{
+	return (location->state != VDO_MAPPING_STATE_UNMAPPED);
+}
+
+static inline bool vdo_is_valid_location(const struct data_location *location)
+{
+	if (location->pbn == VDO_ZERO_BLOCK)
+		return !vdo_is_state_compressed(location->state);
+	else
+		return vdo_is_mapped_location(location);
+}
+
+static inline physical_block_number_t __must_check
+vdo_get_block_map_page_pbn(const struct block_map_page *page)
+{
+	return __le64_to_cpu(page->header.pbn);
+}
+
+struct block_map_page *vdo_format_block_map_page(void *buffer,
+						 nonce_t nonce,
+						 physical_block_number_t pbn,
+						 bool initialized);
+
+enum block_map_page_validity __must_check
+vdo_validate_block_map_page(struct block_map_page *page,
+			    nonce_t nonce,
+			    physical_block_number_t pbn);
+
+static inline page_count_t vdo_compute_block_map_page_count(block_count_t entries)
+{
+	return DIV_ROUND_UP(entries, VDO_BLOCK_MAP_ENTRIES_PER_PAGE);
+}
+
+block_count_t __must_check
+vdo_compute_new_forest_pages(root_count_t root_count,
+			     struct boundary *old_sizes,
+			     block_count_t entries,
+			     struct boundary *new_sizes);
+
+/**
+ * vdo_pack_recovery_journal_entry() - Return the packed, on-disk representation of a recovery
+ *                                     journal entry.
+ * @entry: The journal entry to pack.
+ *
+ * Return: The packed representation of the journal entry.
+ */
+static inline struct packed_recovery_journal_entry
+vdo_pack_recovery_journal_entry(const struct recovery_journal_entry *entry)
+{
+	return (struct packed_recovery_journal_entry) {
+		.operation = entry->operation,
+		.slot_low = entry->slot.slot & 0x3F,
+		.slot_high = (entry->slot.slot >> 6) & 0x0F,
+		.pbn_high_nibble = (entry->slot.pbn >> 32) & 0x0F,
+		.pbn_low_word = __cpu_to_le32(entry->slot.pbn & UINT_MAX),
+		.mapping = vdo_pack_block_map_entry(entry->mapping.pbn, entry->mapping.state),
+		.unmapping = vdo_pack_block_map_entry(entry->unmapping.pbn,
+						      entry->unmapping.state),
+	};
+}
+
+/**
+ * vdo_unpack_recovery_journal_entry() - Unpack the on-disk representation of a recovery journal
+ *                                       entry.
+ * @entry: The recovery journal entry to unpack.
+ *
+ * Return: The unpacked entry.
+ */
+static inline struct recovery_journal_entry
+vdo_unpack_recovery_journal_entry(const struct packed_recovery_journal_entry *entry)
+{
+	physical_block_number_t low32 = __le32_to_cpu(entry->pbn_low_word);
+	physical_block_number_t high4 = entry->pbn_high_nibble;
+
+	return (struct recovery_journal_entry) {
+		.operation = entry->operation,
+		.slot = {
+			.pbn = ((high4 << 32) | low32),
+			.slot = (entry->slot_low | (entry->slot_high << 6)),
+		},
+		.mapping = vdo_unpack_block_map_entry(&entry->mapping),
+		.unmapping = vdo_unpack_block_map_entry(&entry->unmapping),
+	};
+}
+
+const char * __must_check vdo_get_journal_operation_name(enum journal_operation operation);
+
+/**
+ * vdo_is_valid_recovery_journal_sector() - Determine whether the header of the given sector could
+ *                                          describe a valid sector for the given journal block
+ *                                          header.
+ * @header: The unpacked block header to compare against.
+ * @sector: The packed sector to check.
+ * @sector_number: The number of the sector being checked.
+ *
+ * Return: true if the sector matches the block header.
+ */
+static inline bool __must_check
+vdo_is_valid_recovery_journal_sector(const struct recovery_block_header *header,
+				     const struct packed_journal_sector *sector,
+				     u8 sector_number)
+{
+	if ((header->check_byte != sector->check_byte) ||
+	    (header->recovery_count != sector->recovery_count))
+		return false;
+
+	if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2)
+		return sector->entry_count <= RECOVERY_JOURNAL_ENTRIES_PER_SECTOR;
+
+	if (sector_number == 7)
+		return sector->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR;
+
+	return sector->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR;
+}
+
+/**
+ * vdo_compute_recovery_journal_block_number() - Compute the physical block number of the recovery
+ *                                               journal block which would have a given sequence
+ *                                               number.
+ * @journal_size: The size of the journal.
+ * @sequence_number: The sequence number.
+ *
+ * Return: The PBN of the journal block which would have the specified sequence number.
+ */
+static inline physical_block_number_t __must_check
+vdo_compute_recovery_journal_block_number(block_count_t journal_size,
+					  sequence_number_t sequence_number)
+{
+	/*
+	 * Since journal size is a power of two, the block number modulus can just be extracted
+	 * from the low-order bits of the sequence.
+	 */
+	return (sequence_number & (journal_size - 1));
+}
+
+/**
+ * vdo_get_journal_block_sector() - Find the recovery journal sector from the block header and
+ *                                  sector number.
+ * @header: The header of the recovery journal block.
+ * @sector_number: The index of the sector (1-based).
+ *
+ * Return: A packed recovery journal sector.
+ */
+static inline struct packed_journal_sector * __must_check
+vdo_get_journal_block_sector(struct packed_journal_header *header, int sector_number)
+{
+	char *sector_data = ((char *) header) + (VDO_SECTOR_SIZE * sector_number);
+
+	return (struct packed_journal_sector *) sector_data;
+}
+
+/**
+ * vdo_pack_recovery_block_header() - Generate the packed representation of a recovery block
+ *                                    header.
+ * @header: The header containing the values to encode.
+ * @packed: The header into which to pack the values.
+ */
+static inline void vdo_pack_recovery_block_header(const struct recovery_block_header *header,
+						  struct packed_journal_header *packed)
+{
+	*packed = (struct packed_journal_header) {
+		.block_map_head = __cpu_to_le64(header->block_map_head),
+		.slab_journal_head = __cpu_to_le64(header->slab_journal_head),
+		.sequence_number = __cpu_to_le64(header->sequence_number),
+		.nonce = __cpu_to_le64(header->nonce),
+		.logical_blocks_used = __cpu_to_le64(header->logical_blocks_used),
+		.block_map_data_blocks = __cpu_to_le64(header->block_map_data_blocks),
+		.entry_count = __cpu_to_le16(header->entry_count),
+		.check_byte = header->check_byte,
+		.recovery_count = header->recovery_count,
+		.metadata_type = header->metadata_type,
+	};
+}
+
+/**
+ * vdo_unpack_recovery_block_header() - Decode the packed representation of a recovery block
+ *                                      header.
+ * @packed: The packed header to decode.
+ *
+ * Return: The unpacked header.
+ */
+static inline struct recovery_block_header
+vdo_unpack_recovery_block_header(const struct packed_journal_header *packed)
+{
+	return (struct recovery_block_header) {
+		.block_map_head = __le64_to_cpu(packed->block_map_head),
+		.slab_journal_head = __le64_to_cpu(packed->slab_journal_head),
+		.sequence_number = __le64_to_cpu(packed->sequence_number),
+		.nonce = __le64_to_cpu(packed->nonce),
+		.logical_blocks_used = __le64_to_cpu(packed->logical_blocks_used),
+		.block_map_data_blocks = __le64_to_cpu(packed->block_map_data_blocks),
+		.entry_count = __le16_to_cpu(packed->entry_count),
+		.check_byte = packed->check_byte,
+		.recovery_count = packed->recovery_count,
+		.metadata_type = packed->metadata_type,
+	};
+}
+
+/**
+ * vdo_compute_slab_count() - Compute the number of slabs a depot with given parameters would have.
+ * @first_block: PBN of the first data block.
+ * @last_block: PBN of the last data block.
+ * @slab_size_shift: Exponent for the number of blocks per slab.
+ *
+ * Return: The number of slabs.
+ */
+static inline slab_count_t
+vdo_compute_slab_count(physical_block_number_t first_block,
+		       physical_block_number_t last_block,
+		       unsigned int slab_size_shift)
+{
+	return (slab_count_t) ((last_block - first_block) >> slab_size_shift);
+}
+
+int __must_check vdo_configure_slab_depot(const struct partition *partition,
+					  struct slab_config slab_config,
+					  zone_count_t zone_count,
+					  struct slab_depot_state_2_0 *state);
+
+int __must_check vdo_configure_slab(block_count_t slab_size,
+				    block_count_t slab_journal_blocks,
+				    struct slab_config *slab_config);
+
+/**
+ * vdo_get_saved_reference_count_size() - Get the number of blocks required to save the reference
+ *                                        count state covering the specified number of data
+ *                                        blocks.
+ * @block_count: The number of physical data blocks that can be referenced.
+ *
+ * Return: The number of blocks required to save reference counts with the given block count.
+ */
+static inline block_count_t
+vdo_get_saved_reference_count_size(block_count_t block_count)
+{
+	return DIV_ROUND_UP(block_count, COUNTS_PER_BLOCK);
+}
+
+/**
+ * vdo_get_slab_journal_start_block() - Get the physical block number of the start of the slab
+ *                                      journal, relative to the start of the block allocator
+ *                                      partition.
+ * @slab_config: The slab configuration of the VDO.
+ * @origin: The first block of the slab.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_slab_journal_start_block(const struct slab_config *slab_config,
+				 physical_block_number_t origin)
+{
+	return origin + slab_config->data_blocks + slab_config->reference_count_blocks;
+}
+
+/**
+ * vdo_advance_journal_point() - Move the given journal point forward by one entry.
+ * @point: The journal point to adjust.
+ * @entries_per_block: The number of entries in one full block.
+ */
+static inline void
+vdo_advance_journal_point(struct journal_point *point, journal_entry_count_t entries_per_block)
+{
+	point->entry_count++;
+	if (point->entry_count == entries_per_block) {
+		point->sequence_number++;
+		point->entry_count = 0;
+	}
+}
+
+/**
+ * vdo_before_journal_point() - Check whether the first point precedes the second point.
+ * @first: The first journal point.
+ * @second: The second journal point.
+ *
+ * Return: true if the first point precedes the second point.
+ */
+static inline bool
+vdo_before_journal_point(const struct journal_point *first, const struct journal_point *second)
+{
+	return ((first->sequence_number < second->sequence_number) ||
+		((first->sequence_number == second->sequence_number) &&
+		 (first->entry_count < second->entry_count)));
+}
+
+/**
+ * vdo_pack_journal_point() - Encode the journal location represented by a
+ *                            journal_point into a packed_journal_point.
+ * @unpacked: The unpacked input point.
+ * @packed: The packed output point.
+ */
+static inline void
+vdo_pack_journal_point(const struct journal_point *unpacked, struct packed_journal_point *packed)
+{
+	packed->encoded_point =
+		__cpu_to_le64((unpacked->sequence_number << 16) | unpacked->entry_count);
+}
+
+/**
+ * vdo_unpack_journal_point() - Decode the journal location represented by a packed_journal_point
+ *                              into a journal_point.
+ * @packed: The packed input point.
+ * @unpacked: The unpacked output point.
+ */
+static inline void
+vdo_unpack_journal_point(const struct packed_journal_point *packed, struct journal_point *unpacked)
+{
+	u64 native = __le64_to_cpu(packed->encoded_point);
+
+	unpacked->sequence_number = (native >> 16);
+	unpacked->entry_count = (native & 0xffff);
+}
+
+/**
+ * vdo_pack_slab_journal_block_header() - Generate the packed representation of a slab block
+ *                                        header.
+ * @header: The header containing the values to encode.
+ * @packed: The header into which to pack the values.
+ */
+static inline void
+vdo_pack_slab_journal_block_header(const struct slab_journal_block_header *header,
+				   struct packed_slab_journal_block_header *packed)
+{
+	packed->head = __cpu_to_le64(header->head);
+	packed->sequence_number = __cpu_to_le64(header->sequence_number);
+	packed->nonce = __cpu_to_le64(header->nonce);
+	packed->entry_count = __cpu_to_le16(header->entry_count);
+	packed->metadata_type = header->metadata_type;
+	packed->has_block_map_increments = header->has_block_map_increments;
+
+	vdo_pack_journal_point(&header->recovery_point, &packed->recovery_point);
+}
+
+/**
+ * vdo_unpack_slab_journal_block_header() - Decode the packed representation of a slab block
+ *                                          header.
+ * @packed: The packed header to decode.
+ * @header: The header into which to unpack the values.
+ */
+static inline void
+vdo_unpack_slab_journal_block_header(const struct packed_slab_journal_block_header *packed,
+				     struct slab_journal_block_header *header)
+{
+	*header = (struct slab_journal_block_header) {
+		.head = __le64_to_cpu(packed->head),
+		.sequence_number = __le64_to_cpu(packed->sequence_number),
+		.nonce = __le64_to_cpu(packed->nonce),
+		.entry_count = __le16_to_cpu(packed->entry_count),
+		.metadata_type = packed->metadata_type,
+		.has_block_map_increments = packed->has_block_map_increments,
+	};
+	vdo_unpack_journal_point(&packed->recovery_point, &header->recovery_point);
+}
+
+/**
+ * vdo_pack_slab_journal_entry() - Generate the packed encoding of a slab journal entry.
+ * @packed: The entry into which to pack the values.
+ * @sbn: The slab block number of the entry to encode.
+ * @is_increment: The increment flag.
+ */
+static inline void vdo_pack_slab_journal_entry(packed_slab_journal_entry *packed,
+					       slab_block_number sbn,
+					       bool is_increment)
+{
+	packed->offset_low8 = (sbn & 0x0000FF);
+	packed->offset_mid8 = (sbn & 0x00FF00) >> 8;
+	packed->offset_high7 = (sbn & 0x7F0000) >> 16;
+	packed->increment = is_increment ? 1 : 0;
+}
+
+/**
+ * vdo_unpack_slab_journal_entry() - Decode the packed representation of a slab journal entry.
+ * @packed: The packed entry to decode.
+ *
+ * Return: The decoded slab journal entry.
+ */
+static inline struct slab_journal_entry __must_check
+vdo_unpack_slab_journal_entry(const packed_slab_journal_entry *packed)
+{
+	struct slab_journal_entry entry;
+
+	entry.sbn = packed->offset_high7;
+	entry.sbn <<= 8;
+	entry.sbn |= packed->offset_mid8;
+	entry.sbn <<= 8;
+	entry.sbn |= packed->offset_low8;
+	entry.operation = VDO_JOURNAL_DATA_REMAPPING;
+	entry.increment = packed->increment;
+	return entry;
+}
+
+struct slab_journal_entry __must_check
+vdo_decode_slab_journal_entry(struct packed_slab_journal_block *block,
+			      journal_entry_count_t entry_count);
+
+/**
+ * vdo_get_slab_summary_hint_shift() - Compute the shift for slab summary hints.
+ * @slab_size_shift: Exponent for the number of blocks per slab.
+ *
+ * Return: The hint shift.
+ */
+static inline u8 __must_check vdo_get_slab_summary_hint_shift(unsigned int slab_size_shift)
+{
+	return ((slab_size_shift > VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS) ?
+		(slab_size_shift - VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS) :
+		0);
+}
+
+int __must_check vdo_initialize_layout(block_count_t size,
+				       physical_block_number_t offset,
+				       block_count_t block_map_blocks,
+				       block_count_t journal_blocks,
+				       block_count_t summary_blocks,
+				       struct layout *layout);
+
+void vdo_uninitialize_layout(struct layout *layout);
+
+int __must_check vdo_get_partition(struct layout *layout,
+				   enum partition_id id,
+				   struct partition **partition_ptr);
+
+struct partition * __must_check
+vdo_get_known_partition(struct layout *layout, enum partition_id id);
+
+int vdo_validate_config(const struct vdo_config *config,
+			block_count_t physical_block_count,
+			block_count_t logical_block_count);
+
+void vdo_destroy_component_states(struct vdo_component_states *states);
+
+int __must_check
+vdo_decode_component_states(u8 *buffer,
+			    struct volume_geometry *geometry,
+			    struct vdo_component_states *states);
+
+int __must_check
+vdo_validate_component_states(struct vdo_component_states *states,
+			      nonce_t geometry_nonce,
+			      block_count_t physical_size,
+			      block_count_t logical_size);
+
+void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states);
+int __must_check vdo_decode_super_block(u8 *buffer);
+
+/* We start with 0L and XOR the result with ~0L to match our historical usage in userspace. */
+static inline u32 vdo_crc32(const void *buf, unsigned long len)
+{
+	return (crc32(0L, buf, len) ^ ~0L);
+}
+
+#endif /* VDO_ENCODINGS_H */
diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c
new file mode 100644
index 00000000000..e5088ac08a6
--- /dev/null
+++ b/drivers/md/dm-vdo/flush.c
@@ -0,0 +1,563 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "flush.h"
+
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "io-submitter.h"
+#include "logical-zone.h"
+#include "slab-depot.h"
+#include "types.h"
+#include "vdo.h"
+
+struct flusher {
+	struct vdo_completion completion;
+	/** The vdo to which this flusher belongs */
+	struct vdo *vdo;
+	/** The administrative state of the flusher */
+	struct admin_state state;
+	/** The current flush generation of the vdo */
+	sequence_number_t flush_generation;
+	/** The first unacknowledged flush generation */
+	sequence_number_t first_unacknowledged_generation;
+	/** The queue of flush requests waiting to notify other threads */
+	struct wait_queue notifiers;
+	/** The queue of flush requests waiting for VIOs to complete */
+	struct wait_queue pending_flushes;
+	/** The flush generation for which notifications are being sent */
+	sequence_number_t notify_generation;
+	/** The logical zone to notify next */
+	struct logical_zone *logical_zone_to_notify;
+	/** The ID of the thread on which flush requests should be made */
+	thread_id_t thread_id;
+	/** The pool of flush requests */
+	mempool_t *flush_pool;
+	/** Bios waiting for a flush request to become available */
+	struct bio_list waiting_flush_bios;
+	/** The lock to protect the previous fields */
+	spinlock_t lock;
+	/** The rotor for selecting the bio queue for submitting flush bios */
+	zone_count_t bio_queue_rotor;
+	/** The number of flushes submitted to the current bio queue */
+	int flush_count;
+};
+
+/**
+ * assert_on_flusher_thread() - Check that we are on the flusher thread.
+ * @flusher: The flusher.
+ * @caller: The function which is asserting.
+ */
+static inline void assert_on_flusher_thread(struct flusher *flusher, const char *caller)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == flusher->thread_id),
+			"%s() called from flusher thread",
+			caller);
+}
+
+/**
+ * as_flusher() - Convert a generic vdo_completion to a flusher.
+ * @completion: The completion to convert.
+ *
+ * Return: The completion as a flusher.
+ */
+static struct flusher *as_flusher(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_FLUSH_NOTIFICATION_COMPLETION);
+	return container_of(completion, struct flusher, completion);
+}
+
+/**
+ * completion_as_vdo_flush() - Convert a generic vdo_completion to a vdo_flush.
+ * @completion: The completion to convert.
+ *
+ * Return: The completion as a vdo_flush.
+ */
+static inline struct vdo_flush *completion_as_vdo_flush(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_FLUSH_COMPLETION);
+	return container_of(completion, struct vdo_flush, completion);
+}
+
+/**
+ * waiter_as_flush() - Convert a vdo_flush's generic wait queue entry back to the vdo_flush.
+ * @waiter: The wait queue entry to convert.
+ *
+ * Return: The wait queue entry as a vdo_flush.
+ */
+static struct vdo_flush *waiter_as_flush(struct waiter *waiter)
+{
+	return container_of(waiter, struct vdo_flush, waiter);
+}
+
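+/* Allocate a vdo_flush for the flush mempool; implements mempool_alloc_t. */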
+static void *allocate_flush(gfp_t gfp_mask, void *pool_data)
+{
+	struct vdo_flush *flush;
+
+	if ((gfp_mask & GFP_NOWAIT) == GFP_NOWAIT) {
+		flush = UDS_ALLOCATE_NOWAIT(struct vdo_flush, __func__);
+	} else {
+		int result = UDS_ALLOCATE(1, struct vdo_flush, __func__, &flush);
+
+		if (result != VDO_SUCCESS)
+			uds_log_error_strerror(result, "failed to allocate spare flush");
+	}
+
+	if (flush != NULL) {
+		struct flusher *flusher = pool_data;
+
+		vdo_initialize_completion(&flush->completion, flusher->vdo, VDO_FLUSH_COMPLETION);
+	}
+
+	return flush;
+}
+
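+/* Free a vdo_flush allocated by allocate_flush(); implements mempool_free_t. */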
+static void free_flush(void *element, void *pool_data __always_unused)
+{
+	UDS_FREE(element);
+}
+
+/**
+ * vdo_make_flusher() - Make a flusher for a vdo.
+ * @vdo: The vdo which owns the flusher.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_make_flusher(struct vdo *vdo)
+{
+	int result = UDS_ALLOCATE(1, struct flusher, __func__, &vdo->flusher);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	vdo->flusher->vdo = vdo;
+	vdo->flusher->thread_id = vdo->thread_config.packer_thread;
+	vdo_set_admin_state_code(&vdo->flusher->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	vdo_initialize_completion(&vdo->flusher->completion, vdo,
+				  VDO_FLUSH_NOTIFICATION_COMPLETION);
+
+	spin_lock_init(&vdo->flusher->lock);
+	bio_list_init(&vdo->flusher->waiting_flush_bios);
+	vdo->flusher->flush_pool = mempool_create(1, allocate_flush, free_flush, vdo->flusher);
+	return ((vdo->flusher->flush_pool == NULL) ? -ENOMEM : VDO_SUCCESS);
+}
+
+/**
+ * vdo_free_flusher() - Free a flusher.
+ * @flusher: The flusher to free.
+ */
+void vdo_free_flusher(struct flusher *flusher)
+{
+	if (flusher == NULL)
+		return;
+
+	if (flusher->flush_pool != NULL)
+		mempool_destroy(UDS_FORGET(flusher->flush_pool));
+	UDS_FREE(flusher);
+}
+
+/**
+ * vdo_get_flusher_thread_id() - Get the ID of the thread on which flusher functions should be
+ *                               called.
+ * @flusher: The flusher to query.
+ *
+ * Return: The ID of the thread which handles the flusher.
+ */
+thread_id_t vdo_get_flusher_thread_id(struct flusher *flusher)
+{
+	return flusher->thread_id;
+}
+
+static void notify_flush(struct flusher *flusher);
+static void vdo_complete_flush(struct vdo_flush *flush);
+
+/**
+ * finish_notification() - Finish the notification process.
+ * @completion: The flusher completion.
+ *
+ * Finishes the notification process by checking if any flushes have completed and then starting
+ * the notification of the next flush request if one came in while the current notification was in
+ * progress. This callback is registered in flush_packer_callback().
+ */
+static void finish_notification(struct vdo_completion *completion)
+{
+	struct flusher *flusher = as_flusher(completion);
+
+	assert_on_flusher_thread(flusher, __func__);
+
+	vdo_enqueue_waiter(&flusher->pending_flushes,
+			   vdo_dequeue_next_waiter(&flusher->notifiers));
+	vdo_complete_flushes(flusher);
+	if (vdo_has_waiters(&flusher->notifiers))
+		notify_flush(flusher);
+}
+
+/**
+ * flush_packer_callback() - Flush the packer.
+ * @completion: The flusher completion.
+ *
+ * Flushes the packer now that all of the logical zones have been notified of the new flush
+ * request. This callback is registered in increment_generation().
+ */
+static void flush_packer_callback(struct vdo_completion *completion)
+{
+	struct flusher *flusher = as_flusher(completion);
+
+	vdo_increment_packer_flush_generation(flusher->vdo->packer);
+	vdo_launch_completion_callback(completion, finish_notification, flusher->thread_id);
+}
+
+/**
+ * increment_generation() - Increment the flush generation in a logical zone.
+ * @completion: The flusher as a completion.
+ *
+ * If there are more logical zones, go on to the next one; otherwise, flush the packer.
+ * This callback is registered both in notify_flush() and in itself.
+ */
+static void increment_generation(struct vdo_completion *completion)
+{
+	struct flusher *flusher = as_flusher(completion);
+	struct logical_zone *zone = flusher->logical_zone_to_notify;
+
+	vdo_increment_logical_zone_flush_generation(zone, flusher->notify_generation);
+	if (zone->next == NULL) {
+		vdo_launch_completion_callback(completion,
+					       flush_packer_callback,
+					       flusher->thread_id);
+		return;
+	}
+
+	flusher->logical_zone_to_notify = zone->next;
+	vdo_launch_completion_callback(completion,
+				       increment_generation,
+				       flusher->logical_zone_to_notify->thread_id);
+}
+
+/**
+ * notify_flush() - Launch a flush notification.
+ * @flusher: The flusher doing the notification.
+ */
+static void notify_flush(struct flusher *flusher)
+{
+	struct vdo_flush *flush = waiter_as_flush(vdo_get_first_waiter(&flusher->notifiers));
+
+	flusher->notify_generation = flush->flush_generation;
+	flusher->logical_zone_to_notify = &flusher->vdo->logical_zones->zones[0];
+	flusher->completion.requeue = true;
+	vdo_launch_completion_callback(&flusher->completion,
+				       increment_generation,
+				       flusher->logical_zone_to_notify->thread_id);
+}
+
+/**
+ * flush_vdo() - Start processing a flush request.
+ * @completion: A flush request (as a vdo_completion)
+ *
+ * This callback is registered in launch_flush().
+ */
+static void flush_vdo(struct vdo_completion *completion)
+{
+	struct vdo_flush *flush = completion_as_vdo_flush(completion);
+	struct flusher *flusher = completion->vdo->flusher;
+	bool may_notify;
+	int result;
+
+	assert_on_flusher_thread(flusher, __func__);
+	result = ASSERT(vdo_is_state_normal(&flusher->state), "flusher is in normal operation");
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(flusher->vdo, result);
+		vdo_complete_flush(flush);
+		return;
+	}
+
+	flush->flush_generation = flusher->flush_generation++;
+	may_notify = !vdo_has_waiters(&flusher->notifiers);
+	vdo_enqueue_waiter(&flusher->notifiers, &flush->waiter);
+	if (may_notify)
+		notify_flush(flusher);
+}
+
+/**
+ * check_for_drain_complete() - Check whether the flusher has drained.
+ * @flusher: The flusher.
+ */
+static void check_for_drain_complete(struct flusher *flusher)
+{
+	bool drained;
+
+	if (!vdo_is_state_draining(&flusher->state) || vdo_has_waiters(&flusher->pending_flushes))
+		return;
+
+	spin_lock(&flusher->lock);
+	drained = bio_list_empty(&flusher->waiting_flush_bios);
+	spin_unlock(&flusher->lock);
+
+	if (drained)
+		vdo_finish_draining(&flusher->state);
+}
+
+/**
+ * vdo_complete_flushes() - Attempt to complete any flushes which might have finished.
+ * @flusher: The flusher.
+ */
+void vdo_complete_flushes(struct flusher *flusher)
+{
+	sequence_number_t oldest_active_generation = U64_MAX;
+	struct logical_zone *zone;
+
+	assert_on_flusher_thread(flusher, __func__);
+
+	for (zone = &flusher->vdo->logical_zones->zones[0]; zone != NULL; zone = zone->next)
+		oldest_active_generation =
+			min(oldest_active_generation, READ_ONCE(zone->oldest_active_generation));
+
+	while (vdo_has_waiters(&flusher->pending_flushes)) {
+		struct vdo_flush *flush =
+			waiter_as_flush(vdo_get_first_waiter(&flusher->pending_flushes));
+
+		if (flush->flush_generation >= oldest_active_generation)
+			return;
+
+		ASSERT_LOG_ONLY((flush->flush_generation ==
+				 flusher->first_unacknowledged_generation),
+				"acknowledged next expected flush, %llu, was: %llu",
+				(unsigned long long) flusher->first_unacknowledged_generation,
+				(unsigned long long) flush->flush_generation);
+		vdo_dequeue_next_waiter(&flusher->pending_flushes);
+		vdo_complete_flush(flush);
+		flusher->first_unacknowledged_generation++;
+	}
+
+	check_for_drain_complete(flusher);
+}
+
+/**
+ * vdo_dump_flusher() - Dump the flusher, in a thread-unsafe fashion.
+ * @flusher: The flusher.
+ */
+void vdo_dump_flusher(const struct flusher *flusher)
+{
+	uds_log_info("struct flusher");
+	uds_log_info("  flush_generation=%llu first_unacknowledged_generation=%llu",
+		     (unsigned long long) flusher->flush_generation,
+		     (unsigned long long) flusher->first_unacknowledged_generation);
+	uds_log_info("  notifiers queue is %s; pending_flushes queue is %s",
+		     (vdo_has_waiters(&flusher->notifiers) ? "not empty" : "empty"),
+		     (vdo_has_waiters(&flusher->pending_flushes) ? "not empty" : "empty"));
+}
+
+/**
+ * initialize_flush() - Initialize a vdo_flush structure.
+ * @flush: The flush to initialize.
+ * @vdo: The vdo being flushed.
+ *
+ * Initializes a vdo_flush structure, transferring all the bios in the flusher's waiting_flush_bios
+ * list to it. The caller MUST already hold the lock.
+ */
+static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo)
+{
+	bio_list_init(&flush->bios);
+	bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios);
+	bio_list_init(&vdo->flusher->waiting_flush_bios);
+}
+
+static void launch_flush(struct vdo_flush *flush)
+{
+	struct vdo_completion *completion = &flush->completion;
+
+	vdo_prepare_completion(completion,
+			       flush_vdo,
+			       flush_vdo,
+			       completion->vdo->thread_config.packer_thread,
+			       NULL);
+	vdo_enqueue_completion(completion, VDO_DEFAULT_Q_FLUSH_PRIORITY);
+}
+
+/**
+ * vdo_launch_flush() - Function called to start processing a flush request.
+ * @vdo: The vdo.
+ * @bio: The bio containing an empty flush request.
+ *
+ * This is called when we receive an empty flush bio from the block layer, and before acknowledging
+ * a non-empty bio with the FUA flag set.
+ */
+void vdo_launch_flush(struct vdo *vdo, struct bio *bio)
+{
+	/*
+	 * Try to allocate a vdo_flush to represent the flush request. If the allocation fails,
+	 * we'll deal with it later.
+	 */
+	struct vdo_flush *flush = mempool_alloc(vdo->flusher->flush_pool, GFP_NOWAIT);
+	struct flusher *flusher = vdo->flusher;
+	const struct admin_state_code *code = vdo_get_admin_state_code(&flusher->state);
+
+	ASSERT_LOG_ONLY(!code->quiescent, "Flushing not allowed in state %s", code->name);
+
+	spin_lock(&flusher->lock);
+
+	/* We have a new bio to start. Add it to the list. */
+	bio_list_add(&flusher->waiting_flush_bios, bio);
+
+	if (flush == NULL) {
+		spin_unlock(&flusher->lock);
+		return;
+	}
+
+	/* We have flushes to start. Capture them in the vdo_flush structure. */
+	initialize_flush(flush, vdo);
+	spin_unlock(&flusher->lock);
+
+	/* Finish launching the flushes. */
+	launch_flush(flush);
+}
+
+/**
+ * release_flush() - Release a vdo_flush structure that has completed its work.
+ * @flush: The completed flush structure to re-use or free.
+ *
+ * If there are any pending flush requests whose vdo_flush allocation failed, they will be launched
+ * by immediately re-using the released vdo_flush. If there is no spare vdo_flush, the released
+ * structure will become the spare. Otherwise, the vdo_flush will be freed.
+ */
+static void release_flush(struct vdo_flush *flush)
+{
+	bool relaunch_flush;
+	struct flusher *flusher = flush->completion.vdo->flusher;
+
+	spin_lock(&flusher->lock);
+	if (bio_list_empty(&flusher->waiting_flush_bios)) {
+		relaunch_flush = false;
+	} else {
+		/* We have flushes to start. Capture them in a flush request. */
+		initialize_flush(flush, flusher->vdo);
+		relaunch_flush = true;
+	}
+	spin_unlock(&flusher->lock);
+
+	if (relaunch_flush) {
+		/* Finish launching the flushes. */
+		launch_flush(flush);
+		return;
+	}
+
+	mempool_free(flush, flusher->flush_pool);
+}
+
+/**
+ * vdo_complete_flush_callback() - Function called to complete and free a flush request, registered
+ *                                 in vdo_complete_flush().
+ * @completion: The flush request.
+ */
+static void vdo_complete_flush_callback(struct vdo_completion *completion)
+{
+	struct vdo_flush *flush = completion_as_vdo_flush(completion);
+	struct vdo *vdo = completion->vdo;
+	struct bio *bio;
+
+	while ((bio = bio_list_pop(&flush->bios)) != NULL) {
+		/*
+		 * We're not acknowledging this bio now, but we'll never touch it again, so this is
+		 * the last chance to account for it.
+		 */
+		vdo_count_bios(&vdo->stats.bios_acknowledged, bio);
+
+		/* Update the device, and send it on down... */
+		bio_set_dev(bio, vdo_get_backing_device(vdo));
+		atomic64_inc(&vdo->stats.flush_out);
+		submit_bio_noacct(bio);
+	}
+
+	/*
+	 * Release the flush structure, freeing it, re-using it as the spare, or using it to launch
+	 * any flushes that had to wait when allocations failed.
+	 */
+	release_flush(flush);
+}
+
+/**
+ * select_bio_queue() - Select the bio queue on which to finish a flush request.
+ * @flusher: The flusher finishing the request.
+ *
+ * Return: The ID of the bio queue thread on which to complete the flush.
+ */
+static thread_id_t select_bio_queue(struct flusher *flusher)
+{
+	struct vdo *vdo = flusher->vdo;
+	zone_count_t bio_threads = vdo->thread_config.bio_thread_count;
+	int interval;
+
+	if (bio_threads == 1)
+		return vdo->thread_config.bio_threads[0];
+
+	interval = vdo->device_config->thread_counts.bio_rotation_interval;
+	if (flusher->flush_count == interval) {
+		flusher->flush_count = 1;
+		flusher->bio_queue_rotor = ((flusher->bio_queue_rotor + 1) % bio_threads);
+	} else {
+		flusher->flush_count++;
+	}
+
+	return vdo->thread_config.bio_threads[flusher->bio_queue_rotor];
+}
+
+/**
+ * vdo_complete_flush() - Complete and free a vdo flush request.
+ * @flush: The flush request.
+ */
+static void vdo_complete_flush(struct vdo_flush *flush)
+{
+	struct vdo_completion *completion = &flush->completion;
+
+	vdo_prepare_completion(completion,
+			       vdo_complete_flush_callback,
+			       vdo_complete_flush_callback,
+			       select_bio_queue(completion->vdo->flusher),
+			       NULL);
+	vdo_enqueue_completion(completion, BIO_Q_FLUSH_PRIORITY);
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ *
+ * Implements vdo_admin_initiator.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	check_for_drain_complete(container_of(state, struct flusher, state));
+}
+
+/**
+ * vdo_drain_flusher() - Drain the flusher.
+ * @flusher: The flusher to drain.
+ * @completion: The completion to finish when the flusher has drained.
+ *
+ * Drains the flusher by preventing any more VIOs from entering the flusher and then flushing. The
+ * flusher will be left in the suspended state.
+ */
+void vdo_drain_flusher(struct flusher *flusher, struct vdo_completion *completion)
+{
+	assert_on_flusher_thread(flusher, __func__);
+	vdo_start_draining(&flusher->state,
+			   VDO_ADMIN_STATE_SUSPENDING,
+			   completion,
+			   initiate_drain);
+}
+
+/**
+ * vdo_resume_flusher() - Resume a flusher which has been suspended.
+ * @flusher: The flusher to resume.
+ * @parent: The completion to finish when the flusher has resumed.
+ */
+void vdo_resume_flusher(struct flusher *flusher, struct vdo_completion *parent)
+{
+	assert_on_flusher_thread(flusher, __func__);
+	vdo_continue_completion(parent, vdo_resume_if_quiescent(&flusher->state));
+}
diff --git a/drivers/md/dm-vdo/flush.h b/drivers/md/dm-vdo/flush.h
new file mode 100644
index 00000000000..2e66a477d5d
--- /dev/null
+++ b/drivers/md/dm-vdo/flush.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_FLUSH_H
+#define VDO_FLUSH_H
+
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+#include "work-queue.h"
+
+/* A marker for tracking which journal entries are affected by a flush request. */
+struct vdo_flush {
+	/* The completion for enqueueing this flush request. */
+	struct vdo_completion completion;
+	/* The flush bios covered by this request */
+	struct bio_list bios;
+	/* The wait queue entry for this flush */
+	struct waiter waiter;
+	/* Which flush this struct represents */
+	sequence_number_t flush_generation;
+};
+
+struct flusher;
+
+int __must_check vdo_make_flusher(struct vdo *vdo);
+
+void vdo_free_flusher(struct flusher *flusher);
+
+thread_id_t __must_check vdo_get_flusher_thread_id(struct flusher *flusher);
+
+void vdo_complete_flushes(struct flusher *flusher);
+
+void vdo_dump_flusher(const struct flusher *flusher);
+
+void vdo_launch_flush(struct vdo *vdo, struct bio *bio);
+
+void vdo_drain_flusher(struct flusher *flusher, struct vdo_completion *completion);
+
+void vdo_resume_flusher(struct flusher *flusher, struct vdo_completion *parent);
+
+#endif /* VDO_FLUSH_H */
diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c
new file mode 100644
index 00000000000..9beb9642ae1
--- /dev/null
+++ b/drivers/md/dm-vdo/int-map.c
@@ -0,0 +1,710 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+/**
+ * DOC:
+ *
+ * Hash table implementation of a map from integers to pointers, implemented using the Hopscotch
+ * Hashing algorithm by Herlihy, Shavit, and Tzafrir (see
+ * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does not contain any of the
+ * locking/concurrency features of the algorithm, just the collision resolution scheme.
+ *
+ * Hopscotch Hashing is based on hashing with open addressing and linear probing. All the entries
+ * are stored in a fixed array of buckets, with no dynamic allocation for collisions. Unlike linear
+ * probing, all the entries that hash to a given bucket are stored within a fixed neighborhood
+ * starting at that bucket. Chaining is effectively represented as a bit vector relative to each
+ * bucket instead of as pointers or explicit offsets.
+ *
+ * When an empty bucket cannot be found within a given neighborhood, subsequent neighborhoods are
+ * searched, and one or more entries will "hop" into those neighborhoods. When this process works,
+ * an empty bucket will move into the desired neighborhood, allowing the entry to be added. When
+ * that process fails (typically when the buckets are around 90% full), the table must be resized
+ * and all the entries rehashed and added to the expanded table.
+ *
+ * Unlike linear probing, the number of buckets that must be searched in the worst case has a fixed
+ * upper bound (the size of the neighborhood). Those entries occupy a small number of memory cache
+ * lines, leading to improved use of the cache (fewer misses on both successful and unsuccessful
+ * searches). Hopscotch hashing outperforms linear probing at much higher load factors, so even
+ * with the increased memory burden for maintaining the hop vectors, less memory is needed to
+ * achieve that performance. Hopscotch is also immune to "contamination" from deleting entries
+ * since entries are genuinely removed instead of being replaced by a placeholder.
+ *
+ * The published description of the algorithm used a bit vector, but the paper alludes to an offset
+ * scheme which is used by this implementation. Since the entries in the neighborhood are within N
+ * entries of the hash bucket at the start of the neighborhood, a pair of small offset fields each
+ * log2(N) bits wide is all that's needed to maintain the hops as a linked list. In order to encode
+ * "no next hop" (i.e. NULL) as the natural initial value of zero, the offsets are biased by one
+ * (i.e. 0 => NULL, 1 => offset=0, 2 => offset=1, etc.) We can represent neighborhoods of up to 255
+ * entries with just 8+8=16 bits per entry. The hop list is sorted by hop offset so the first entry
+ * in the list is always the bucket closest to the start of the neighborhood.
+ *
+ * While individual accesses tend to be very fast, the table resize operations are very, very
+ * expensive. If an upper bound on the latency of adding an entry to the table is needed, we either
+ * need to ensure the table is pre-sized to be large enough so no resize is ever needed, or we'll
+ * need to develop an approach to incrementally resize the table.
+ */
+
+#include "int-map.h"
+
+#include <linux/minmax.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+
+enum {
+	DEFAULT_CAPACITY = 16, /* the number of neighborhoods in a new table */
+	NEIGHBORHOOD = 255,    /* the number of buckets in each neighborhood */
+	MAX_PROBES = 1024,     /* limit on the number of probes for a free bucket */
+	NULL_HOP_OFFSET = 0,   /* the hop offset value terminating the hop list */
+	DEFAULT_LOAD = 75      /* a compromise between memory use and performance */
+};
+
+/**
+ * struct bucket - hash bucket
+ *
+ * Buckets are packed together to reduce memory usage and improve cache efficiency. It would be
+ * tempting to encode the hop offsets separately and maintain alignment of key/value pairs, but
+ * it's crucial to keep the hop fields near the buckets that use them so they'll tend to share
+ * cache lines.
+ */
+struct __packed bucket {
+	/**
+	 * @first_hop: The biased offset of the first entry in the hop list of the neighborhood
+	 *             that hashes to this bucket.
+	 */
+	u8 first_hop;
+	/** @next_hop: The biased offset of the next bucket in the hop list. */
+	u8 next_hop;
+	/** @key: The key stored in this bucket. */
+	u64 key;
+	/** @value: The value stored in this bucket (NULL if empty). */
+	void *value;
+};
+
+/**
+ * struct int_map - The concrete definition of the opaque int_map type.
+ *
+ * To avoid having to wrap the neighborhoods of the last entries back around to the start of the
+ * bucket array, we allocate a few more buckets at the end of the array instead, which is why
+ * capacity and bucket_count are different.
+ */
+struct int_map {
+	/** @size: The number of entries stored in the map. */
+	size_t size;
+	/** @capacity: The number of neighborhoods in the map. */
+	size_t capacity;
+	/** @bucket_count: The number of buckets in the bucket array. */
+	size_t bucket_count;
+	/** @buckets: The array of hash buckets. */
+	struct bucket *buckets;
+};
+
+/**
+ * mix() - The Google CityHash 16-byte hash mixing function.
+ * @input1: The first input value.
+ * @input2: The second input value.
+ *
+ * Return: A hash of the two inputs.
+ */
+static u64 mix(u64 input1, u64 input2)
+{
+	static const u64 CITY_MULTIPLIER = 0x9ddfea08eb382d69ULL;
+	u64 hash = (input1 ^ input2);
+
+	hash *= CITY_MULTIPLIER;
+	hash ^= (hash >> 47);
+	hash ^= input2;
+	hash *= CITY_MULTIPLIER;
+	hash ^= (hash >> 47);
+	hash *= CITY_MULTIPLIER;
+	return hash;
+}
+
+/**
+ * hash_key() - Calculate a 64-bit non-cryptographic hash value for the provided 64-bit integer
+ *              key.
+ * @key: The mapping key.
+ *
+ * The implementation is based on Google's CityHash, only handling the specific case of an 8-byte
+ * input.
+ *
+ * Return: The hash of the mapping key.
+ */
+static u64 hash_key(u64 key)
+{
+	/*
+	 * Aliasing restrictions forbid us from casting pointer types, so use a union to convert a
+	 * single u64 to two u32 values.
+	 */
+	union {
+		u64 u64;
+		u32 u32[2];
+	} pun = {.u64 = key};
+
+	return mix(sizeof(key) + (((u64) pun.u32[0]) << 3), pun.u32[1]);
+}
+
+/**
+ * allocate_buckets() - Initialize an int_map.
+ * @map: The map to initialize.
+ * @capacity: The initial capacity of the map.
+ *
+ * Return: UDS_SUCCESS or an error code.
+ */
+static int allocate_buckets(struct int_map *map, size_t capacity)
+{
+	map->size = 0;
+	map->capacity = capacity;
+
+	/*
+	 * Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a full neighborhood
+	 * without having to wrap back around to element zero.
+	 */
+	map->bucket_count = capacity + (NEIGHBORHOOD - 1);
+	return UDS_ALLOCATE(map->bucket_count, struct bucket,
+			    "struct int_map buckets", &map->buckets);
+}
+
+/**
+ * vdo_make_int_map() - Allocate and initialize an int_map.
+ * @initial_capacity: The number of entries the map should initially be capable of holding (zero
+ *                    tells the map to use its own small default).
+ * @initial_load: The load factor of the map, expressed as an integer percentage (typically in the
+ *                range 50 to 90, with zero telling the map to use its own default).
+ * @map_ptr: Output, a pointer to hold the new int_map.
+ *
+ * Return: UDS_SUCCESS or an error code.
+ */
+int vdo_make_int_map(size_t initial_capacity, unsigned int initial_load, struct int_map **map_ptr)
+{
+	struct int_map *map;
+	int result;
+	size_t capacity;
+
+	/* Use the default initial load if the caller did not specify one. */
+	if (initial_load == 0)
+		initial_load = DEFAULT_LOAD;
+	if (initial_load > 100)
+		return UDS_INVALID_ARGUMENT;
+
+	result = UDS_ALLOCATE(1, struct int_map, "struct int_map", &map);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	/* Use the default capacity if the caller did not specify one. */
+	capacity = (initial_capacity > 0) ? initial_capacity : DEFAULT_CAPACITY;
+
+	/*
+	 * Scale up the capacity by the specified initial load factor (i.e., to hold 1000 entries
+	 * at 80% load we need a capacity of 1250).
+	 */
+	capacity = capacity * 100 / initial_load;
+
+	result = allocate_buckets(map, capacity);
+	if (result != UDS_SUCCESS) {
+		vdo_free_int_map(UDS_FORGET(map));
+		return result;
+	}
+
+	*map_ptr = map;
+	return UDS_SUCCESS;
+}
+
+/**
+ * vdo_free_int_map() - Free an int_map.
+ * @map: The int_map to free.
+ *
+ * NOTE: The map does not own the pointer values stored in the map and they are not freed by this
+ * call.
+ */
+void vdo_free_int_map(struct int_map *map)
+{
+	if (map == NULL)
+		return;
+
+	UDS_FREE(UDS_FORGET(map->buckets));
+	UDS_FREE(UDS_FORGET(map));
+}
+
+/**
+ * vdo_int_map_size() - Get the number of entries stored in an int_map.
+ * @map: The int_map to query.
+ *
+ * Return: The number of entries in the map.
+ */
+size_t vdo_int_map_size(const struct int_map *map)
+{
+	return map->size;
+}
+
+/**
+ * dereference_hop() - Convert a biased hop offset within a neighborhood to a pointer to the bucket
+ *                     it references.
+ * @neighborhood: The first bucket in the neighborhood.
+ * @hop_offset: The biased hop offset to the desired bucket.
+ *
+ * Return: NULL if hop_offset is zero, otherwise a pointer to the bucket in the neighborhood at
+ *         hop_offset - 1.
+ */
+static struct bucket *dereference_hop(struct bucket *neighborhood, unsigned int hop_offset)
+{
+	if (hop_offset == NULL_HOP_OFFSET)
+		return NULL;
+
+	STATIC_ASSERT(NULL_HOP_OFFSET == 0);
+	return &neighborhood[hop_offset - 1];
+}
+
+/**
+ * insert_in_hop_list() - Add a bucket into the hop list for the neighborhood.
+ * @neighborhood: The first bucket in the neighborhood.
+ * @new_bucket: The bucket to add to the hop list.
+ *
+ * The bucket is inserted into the list so that the hop list remains sorted by hop offset.
+ */
+static void insert_in_hop_list(struct bucket *neighborhood, struct bucket *new_bucket)
+{
+	/* Zero indicates a NULL hop offset, so bias the hop offset by one. */
+	int hop_offset = 1 + (new_bucket - neighborhood);
+
+	/* Handle the special case of adding a bucket at the start of the list. */
+	int next_hop = neighborhood->first_hop;
+
+	if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) {
+		new_bucket->next_hop = next_hop;
+		neighborhood->first_hop = hop_offset;
+		return;
+	}
+
+	/* Search the hop list for the insertion point that maintains the sort order. */
+	for (;;) {
+		struct bucket *bucket = dereference_hop(neighborhood, next_hop);
+
+		next_hop = bucket->next_hop;
+
+		if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) {
+			new_bucket->next_hop = next_hop;
+			bucket->next_hop = hop_offset;
+			return;
+		}
+	}
+}
+
+/**
+ * select_bucket() - Select and return the hash bucket for a given search key.
+ * @map: The map to search.
+ * @key: The mapping key.
+ */
+static struct bucket *select_bucket(const struct int_map *map, u64 key)
+{
+	/*
+	 * Calculate a good hash value for the provided key. We want exactly 32 bits, so mask the
+	 * result.
+	 */
+	u64 hash = hash_key(key) & 0xFFFFFFFF;
+
+	/*
+	 * Scale the 32-bit hash to a bucket index by treating it as a binary fraction and
+	 * multiplying that by the capacity. If the hash is uniformly distributed over [0 ..
+	 * 2^32-1], then (hash * capacity / 2^32) should be uniformly distributed over [0 ..
+	 * capacity-1]. The multiply and shift is much faster than a divide (modulus) on X86 CPUs.
+	 */
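+	/*
+	 * For example (illustrative numbers only): with capacity = 1000 and hash = 0x80000000
+	 * (the binary fraction 0.5), (hash * capacity) >> 32 yields bucket index 500 with no
+	 * division required.
+	 */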
+	return &map->buckets[(hash * map->capacity) >> 32];
+}
+
+/**
+ * search_hop_list() - Search the hop list associated with given hash bucket for a given search
+ *                     key.
+ * @map: The map being searched.
+ * @bucket: The map bucket to search for the key.
+ * @key: The mapping key.
+ * @previous_ptr: Output. If not NULL, a pointer in which to store the bucket in the list
+ *                preceding the one that had the matching key.
+ *
+ * If the key is found, returns a pointer to the entry (bucket or collision), otherwise returns
+ * NULL.
+ *
+ * Return: An entry that matches the key, or NULL if not found.
+ */
+static struct bucket *search_hop_list(struct int_map *map __always_unused,
+				      struct bucket *bucket,
+				      u64 key,
+				      struct bucket **previous_ptr)
+{
+	struct bucket *previous = NULL;
+	unsigned int next_hop = bucket->first_hop;
+
+	while (next_hop != NULL_HOP_OFFSET) {
+		/*
+		 * Check the neighboring bucket indexed by the offset for the
+		 * desired key.
+		 */
+		struct bucket *entry = dereference_hop(bucket, next_hop);
+
+		if ((key == entry->key) && (entry->value != NULL)) {
+			if (previous_ptr != NULL)
+				*previous_ptr = previous;
+			return entry;
+		}
+		next_hop = entry->next_hop;
+		previous = entry;
+	}
+	return NULL;
+}
+
+/**
+ * vdo_int_map_get() - Retrieve the value associated with a given key from the int_map.
+ * @map: The int_map to query.
+ * @key: The key to look up.
+ *
+ * Return: The value associated with the given key, or NULL if the key is not mapped to any value.
+ */
+void *vdo_int_map_get(struct int_map *map, u64 key)
+{
+	struct bucket *match = search_hop_list(map, select_bucket(map, key), key, NULL);
+
+	return ((match != NULL) ? match->value : NULL);
+}
+
+/**
+ * resize_buckets() - Increase the number of hash buckets.
+ * @map: The map to resize.
+ *
+ * Resizes and rehashes all the existing entries, storing them in the new buckets.
+ *
+ * Return: UDS_SUCCESS or an error code.
+ */
+static int resize_buckets(struct int_map *map)
+{
+	int result;
+	size_t i;
+
+	/* Copy the top-level map data to the stack. */
+	struct int_map old_map = *map;
+
+	/* Re-initialize the map to be empty and 50% larger. */
+	size_t new_capacity = map->capacity / 2 * 3;
+
+	uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu",
+		     __func__, map->capacity, new_capacity, map->size);
+	result = allocate_buckets(map, new_capacity);
+	if (result != UDS_SUCCESS) {
+		*map = old_map;
+		return result;
+	}
+
+	/* Populate the new hash table from the entries in the old bucket array. */
+	for (i = 0; i < old_map.bucket_count; i++) {
+		struct bucket *entry = &old_map.buckets[i];
+
+		if (entry->value == NULL)
+			continue;
+
+		result = vdo_int_map_put(map, entry->key, entry->value, true, NULL);
+		if (result != UDS_SUCCESS) {
+			/* Destroy the new partial map and restore the map from the stack. */
+			UDS_FREE(UDS_FORGET(map->buckets));
+			*map = old_map;
+			return result;
+		}
+	}
+
+	/* Destroy the old bucket array. */
+	UDS_FREE(UDS_FORGET(old_map.buckets));
+	return UDS_SUCCESS;
+}
+
+/**
+ * find_empty_bucket() - Probe the bucket array starting at the given bucket for the next empty
+ *                       bucket, returning a pointer to it.
+ * @map: The map containing the buckets to search.
+ * @bucket: The bucket at which to start probing.
+ * @max_probes: The maximum number of buckets to search.
+ *
+ * NULL will be returned if the search reaches the end of the bucket array or if the number of
+ * linear probes exceeds a specified limit.
+ *
+ * Return: The next empty bucket, or NULL if the search failed.
+ */
+static struct bucket *
+find_empty_bucket(struct int_map *map, struct bucket *bucket, unsigned int max_probes)
+{
+	/*
+	 * Limit the search to either the nearer of the end of the bucket array or a fixed distance
+	 * beyond the initial bucket.
+	 */
+	ptrdiff_t remaining = &map->buckets[map->bucket_count] - bucket;
+	struct bucket *sentinel = &bucket[min_t(ptrdiff_t, remaining, max_probes)];
+	struct bucket *entry;
+
+	for (entry = bucket; entry < sentinel; entry++)
+		if (entry->value == NULL)
+			return entry;
+	return NULL;
+}
+
+/**
+ * move_empty_bucket() - Move an empty bucket closer to the start of the bucket array.
+ * @map: The map containing the bucket.
+ * @hole: The empty bucket to fill with an entry that precedes it in one of its enclosing
+ *        neighborhoods.
+ *
+ * This searches the neighborhoods that contain the empty bucket for a non-empty bucket closer to
+ * the start of the array. If such a bucket is found, this swaps the two buckets by moving the
+ * entry to the empty bucket.
+ *
+ * Return: The bucket that was vacated by moving its entry to the provided hole, or NULL if no
+ *         entry could be moved.
+ */
+static struct bucket *move_empty_bucket(struct int_map *map __always_unused, struct bucket *hole)
+{
+	/*
+	 * Examine every neighborhood that the empty bucket is part of, starting with the one in
+	 * which it is the last bucket. No boundary check is needed for the negative array
+	 * arithmetic since this function is only called when hole is at least NEIGHBORHOOD cells
+	 * deeper into the array than a valid bucket.
+	 */
+	struct bucket *bucket;
+
+	for (bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) {
+		/*
+		 * Find the entry that is nearest to the bucket, which means it will be nearest to
+		 * the hash bucket whose neighborhood is full.
+		 */
+		struct bucket *new_hole = dereference_hop(bucket, bucket->first_hop);
+
+		if (new_hole == NULL)
+			/*
+			 * There are no buckets in this neighborhood that are in use by this one
+			 * (they must all be owned by overlapping neighborhoods).
+			 */
+			continue;
+
+		/*
+		 * Skip this bucket if its first entry is actually further away than the hole that
+		 * we're already trying to fill.
+		 */
+		if (hole < new_hole)
+			continue;
+
+		/*
+		 * We've found an entry in this neighborhood that we can "hop" further away, moving
+		 * the hole closer to the hash bucket, if not all the way into its neighborhood.
+		 */
+
+		/*
+		 * The entry that will be the new hole is the first bucket in the list, so setting
+		 * first_hop is all that's needed to remove it from the list.
+		 */
+		bucket->first_hop = new_hole->next_hop;
+		new_hole->next_hop = NULL_HOP_OFFSET;
+
+		/* Move the entry into the original hole. */
+		hole->key = new_hole->key;
+		hole->value = new_hole->value;
+		new_hole->value = NULL;
+
+		/* Insert the filled hole into the hop list for the neighborhood. */
+		insert_in_hop_list(bucket, hole);
+		return new_hole;
+	}
+
+	/* We couldn't find an entry to relocate to the hole. */
+	return NULL;
+}
+
+/**
+ * update_mapping() - Find and update any existing mapping for a given key, returning the value
+ *                    associated with the key in the provided pointer.
+ * @map: The int_map to attempt to modify.
+ * @neighborhood: The first bucket in the neighborhood that would contain the search key
+ * @key: The key with which to associate the new value.
+ * @new_value: The value to be associated with the key.
+ * @update: Whether to overwrite an existing value.
+ * @old_value_ptr: a pointer in which to store the old value (unmodified if no mapping was found)
+ *
+ * Return: true if the map contains a mapping for the key, false if it does not.
+ */
+static bool update_mapping(struct int_map *map,
+			   struct bucket *neighborhood,
+			   u64 key,
+			   void *new_value,
+			   bool update,
+			   void **old_value_ptr)
+{
+	struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL);
+
+	if (bucket == NULL)
+		/* There is no bucket containing the key in the neighborhood. */
+		return false;
+
+	/*
+	 * Return the value of the current mapping (if desired) and update the mapping with the new
+	 * value (if desired).
+	 */
+	if (old_value_ptr != NULL)
+		*old_value_ptr = bucket->value;
+	if (update)
+		bucket->value = new_value;
+	return true;
+}
+
+/**
+ * find_or_make_vacancy() - Find an empty bucket.
+ * @map: The int_map to search or modify.
+ * @neighborhood: The first bucket in the neighborhood in which an empty bucket is needed for a new
+ *                mapping.
+ *
+ * Find an empty bucket in a specified neighborhood for a new mapping or attempt to re-arrange
+ * mappings so there is such a bucket. This operation may fail (returning NULL) if an empty bucket
+ * is not available or could not be relocated to the neighborhood.
+ *
+ * Return: a pointer to an empty bucket in the desired neighborhood, or NULL if a vacancy could not
+ *         be found or arranged.
+ */
+static struct bucket *find_or_make_vacancy(struct int_map *map, struct bucket *neighborhood)
+{
+	/* Probe within and beyond the neighborhood for the first empty bucket. */
+	struct bucket *hole = find_empty_bucket(map, neighborhood, MAX_PROBES);
+
+	/*
+	 * Keep trying until the empty bucket is in the bucket's neighborhood or we are unable to
+	 * move it any closer by swapping it with a filled bucket.
+	 */
+	while (hole != NULL) {
+		int distance = hole - neighborhood;
+
+		if (distance < NEIGHBORHOOD)
+			/*
+			 * We've found or relocated an empty bucket close enough to the initial
+			 * hash bucket to be referenced by its hop vector.
+			 */
+			return hole;
+
+		/*
+		 * The nearest empty bucket isn't within the neighborhood that must contain the new
+		 * entry, so try to swap it with a bucket that is closer.
+		 */
+		hole = move_empty_bucket(map, hole);
+	}
+
+	return NULL;
+}
+
+/**
+ * vdo_int_map_put() - Try to associate a value with an integer.
+ * @map: The int_map to attempt to modify.
+ * @key: The key with which to associate the new value.
+ * @new_value: The value to be associated with the key.
+ * @update: Whether to overwrite an existing value.
+ * @old_value_ptr: A pointer in which to store either the old value (if the key was already mapped)
+ *                 or NULL if the map did not contain the key; NULL may be provided if the caller
+ *                 does not need to know the old value
+ *
+ * Try to associate a value (a pointer) with an integer in an int_map. If the map already contains
+ * a mapping for the provided key, the old value is only replaced with the specified value if
+ * update is true. In either case the old value is returned. If the map does not already contain a
+ * value for the specified key, the new value is added regardless of the value of update.
+ *
+ * Return: UDS_SUCCESS or an error code.
+ */
+int vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update, void **old_value_ptr)
+{
+	struct bucket *neighborhood, *bucket;
+
+	if (new_value == NULL)
+		return UDS_INVALID_ARGUMENT;
+
+	/*
+	 * Select the bucket at the start of the neighborhood that must contain any entry for the
+	 * provided key.
+	 */
+	neighborhood = select_bucket(map, key);
+
+	/*
+	 * Check whether the neighborhood already contains an entry for the key, in which case we
+	 * optionally update it, returning the old value.
+	 */
+	if (update_mapping(map, neighborhood, key, new_value, update, old_value_ptr))
+		return UDS_SUCCESS;
+
+	/*
+	 * Find an empty bucket in the desired neighborhood for the new entry or re-arrange entries
+	 * in the map so there is such a bucket. This operation will usually succeed; the loop body
+	 * will only be executed on the rare occasions that we have to resize the map.
+	 */
+	while ((bucket = find_or_make_vacancy(map, neighborhood)) == NULL) {
+		int result;
+
+		/*
+		 * There is no empty bucket in which to put the new entry in the current map, so
+		 * we're forced to allocate a new bucket array with a larger capacity, re-hash all
+		 * the entries into those buckets, and try again (a very expensive operation for
+		 * large maps).
+		 */
+		result = resize_buckets(map);
+		if (result != UDS_SUCCESS)
+			return result;
+
+		/*
+		 * Resizing the map invalidates all pointers to buckets, so recalculate the
+		 * neighborhood pointer.
+		 */
+		neighborhood = select_bucket(map, key);
+	}
+
+	/* Put the new entry in the empty bucket, adding it to the neighborhood. */
+	bucket->key = key;
+	bucket->value = new_value;
+	insert_in_hop_list(neighborhood, bucket);
+	map->size += 1;
+
+	/* There was no existing entry, so there was no old value to be returned. */
+	if (old_value_ptr != NULL)
+		*old_value_ptr = NULL;
+	return UDS_SUCCESS;
+}
+
+/**
+ * vdo_int_map_remove() - Remove the mapping for a given key from the int_map.
+ * @map: The int_map from which to remove the mapping.
+ * @key: The key whose mapping is to be removed.
+ *
+ * Return: the value that was associated with the key, or NULL if it was not mapped.
+ */
+void *vdo_int_map_remove(struct int_map *map, u64 key)
+{
+	void *value;
+
+	/* Select the bucket to search and search it for an existing entry. */
+	struct bucket *bucket = select_bucket(map, key);
+	struct bucket *previous;
+	struct bucket *victim = search_hop_list(map, bucket, key, &previous);
+
+	if (victim == NULL)
+		/* There is no matching entry to remove. */
+		return NULL;
+
+	/*
+	 * We found an entry to remove. Save the mapped value to return later and empty the bucket.
+	 */
+	map->size -= 1;
+	value = victim->value;
+	victim->value = NULL;
+	victim->key = 0;
+
+	/* The victim bucket is now empty, but it still needs to be spliced out of the hop list. */
+	if (previous == NULL)
+		/* The victim is the head of the list, so swing first_hop. */
+		bucket->first_hop = victim->next_hop;
+	else
+		previous->next_hop = victim->next_hop;
+	victim->next_hop = NULL_HOP_OFFSET;
+
+	return value;
+}
diff --git a/drivers/md/dm-vdo/int-map.h b/drivers/md/dm-vdo/int-map.h
new file mode 100644
index 00000000000..cced57c4016
--- /dev/null
+++ b/drivers/md/dm-vdo/int-map.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_INT_MAP_H
+#define VDO_INT_MAP_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: int_map
+ *
+ * An int_map associates pointers (void *) with integer keys (u64). NULL pointer values are
+ * not supported.
+ *
+ * The map is implemented as a hash table, which should provide constant-time insert, query, and
+ * remove operations, although the insert may occasionally grow the table, which is linear in the
+ * number of entries in the map. The table will grow as needed to hold new entries, but will not
+ * shrink as entries are removed.
+ */
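+
+/*
+ * A minimal usage sketch (illustrative only; "object" stands for any non-NULL pointer the caller
+ * owns, since the map never takes ownership of its values):
+ *
+ *	struct int_map *map;
+ *	int result = vdo_make_int_map(0, 0, &map);
+ *
+ *	if (result == UDS_SUCCESS) {
+ *		result = vdo_int_map_put(map, 42, object, true, NULL);
+ *		object = vdo_int_map_get(map, 42);
+ *		vdo_free_int_map(map);
+ *	}
+ */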
+
+struct int_map;
+
+int __must_check
+vdo_make_int_map(size_t initial_capacity, unsigned int initial_load, struct int_map **map_ptr);
+
+void vdo_free_int_map(struct int_map *map);
+
+size_t vdo_int_map_size(const struct int_map *map);
+
+void *vdo_int_map_get(struct int_map *map, u64 key);
+
+int __must_check
+vdo_int_map_put(struct int_map *map, u64 key, void *new_value, bool update, void **old_value_ptr);
+
+void *vdo_int_map_remove(struct int_map *map, u64 key);
+
+#endif /* VDO_INT_MAP_H */
diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c
new file mode 100644
index 00000000000..a77f6c4ec7a
--- /dev/null
+++ b/drivers/md/dm-vdo/io-submitter.c
@@ -0,0 +1,483 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "io-submitter.h"
+
+#include <linux/bio.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "data-vio.h"
+#include "logger.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+
+/*
+ * Submission of bio operations to the underlying storage device will go through a separate work
+ * queue thread (or more than one) to prevent blocking in other threads if the storage device has a
+ * full queue. The plug structure allows that thread to do better batching of requests to make the
+ * I/O more efficient.
+ *
+ * When multiple worker threads are used, a thread is chosen for an I/O operation submission based
+ * on the PBN, so a given PBN will consistently wind up on the same thread. Flush operations are
+ * assigned round-robin.
+ *
+ * The map (protected by the mutex) collects pending I/O operations so that the worker thread can
+ * reorder them to try to encourage I/O request merging in the request queue underneath.
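+ *
+ * A sketch of the merge bookkeeping (see try_bio_map_merge() and map_merged_vio() below): each
+ * pending vio is registered in the map under both the first and the last sector of its merged bio
+ * list, so a new bio whose starting sector is one block past an existing tail, or one block before
+ * an existing head, can find its merge partner with lookups such as (new_sector being the new
+ * bio's starting sector):
+ *
+ *	prev = vdo_int_map_get(map, new_sector - VDO_SECTORS_PER_BLOCK);
+ *	next = vdo_int_map_get(map, new_sector + VDO_SECTORS_PER_BLOCK);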
+ */
+struct bio_queue_data {
+	struct vdo_work_queue *queue;
+	struct blk_plug plug;
+	struct int_map *map;
+	struct mutex lock;
+	unsigned int queue_number;
+};
+
+struct io_submitter {
+	unsigned int num_bio_queues_used;
+	unsigned int bio_queue_rotation_interval;
+	struct bio_queue_data bio_queue_data[];
+};
+
+static void start_bio_queue(void *ptr)
+{
+	struct bio_queue_data *bio_queue_data = (struct bio_queue_data *) ptr;
+
+	blk_start_plug(&bio_queue_data->plug);
+}
+
+static void finish_bio_queue(void *ptr)
+{
+	struct bio_queue_data *bio_queue_data = (struct bio_queue_data *) ptr;
+
+	blk_finish_plug(&bio_queue_data->plug);
+}
+
+static const struct vdo_work_queue_type bio_queue_type = {
+	.start = start_bio_queue,
+	.finish = finish_bio_queue,
+	.max_priority = BIO_Q_MAX_PRIORITY,
+	.default_priority = BIO_Q_DATA_PRIORITY,
+};
+
+/**
+ * count_all_bios() - Count a bio in the appropriate counter(s) for its vio type.
+ * @vio: The vio associated with the bio.
+ * @bio: The bio to count.
+ */
+static void count_all_bios(struct vio *vio, struct bio *bio)
+{
+	struct atomic_statistics *stats = &vio->completion.vdo->stats;
+
+	if (is_data_vio(vio)) {
+		vdo_count_bios(&stats->bios_out, bio);
+		return;
+	}
+
+	vdo_count_bios(&stats->bios_meta, bio);
+	if (vio->type == VIO_TYPE_RECOVERY_JOURNAL)
+		vdo_count_bios(&stats->bios_journal, bio);
+	else if (vio->type == VIO_TYPE_BLOCK_MAP)
+		vdo_count_bios(&stats->bios_page_cache, bio);
+}
+
+/**
+ * assert_in_bio_zone() - Assert that a vio is in the correct bio zone and not in interrupt
+ *                        context.
+ * @vio: The vio to check.
+ */
+static void assert_in_bio_zone(struct vio *vio)
+{
+	ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context");
+	assert_vio_in_bio_zone(vio);
+}
+
+/**
+ * send_bio_to_device() - Update stats and tracing info, then submit the supplied bio to the OS for
+ *                        processing.
+ * @vio: The vio associated with the bio.
+ * @bio: The bio to submit to the OS.
+ */
+static void send_bio_to_device(struct vio *vio, struct bio *bio)
+{
+	struct vdo *vdo = vio->completion.vdo;
+
+	assert_in_bio_zone(vio);
+	atomic64_inc(&vdo->stats.bios_submitted);
+	count_all_bios(vio, bio);
+	bio_set_dev(bio, vdo_get_backing_device(vdo));
+	submit_bio_noacct(bio);
+}
+
+static sector_t get_bio_sector(struct bio *bio)
+{
+	return bio->bi_iter.bi_sector;
+}
+
+/**
+ * process_vio_io() - Submits a vio's bio to the underlying block device. May block if the device
+ *                    is busy. This callback should be used by vios which did not attempt to merge.
+ */
+void process_vio_io(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+
+	send_bio_to_device(vio, vio->bio);
+}
+
+/**
+ * get_bio_list() - Extract the list of bios to submit from a vio.
+ * @vio: The vio submitting I/O.
+ *
+ * The list will always contain at least one entry (the bio for the vio on which it is called), but
+ * other bios may have been merged with it as well.
+ *
+ * Return: The head of the bio list to submit.
+ */
+static struct bio *get_bio_list(struct vio *vio)
+{
+	struct bio *bio;
+	struct io_submitter *submitter = vio->completion.vdo->io_submitter;
+	struct bio_queue_data *bio_queue_data = &(submitter->bio_queue_data[vio->bio_zone]);
+
+	assert_in_bio_zone(vio);
+
+	mutex_lock(&bio_queue_data->lock);
+	vdo_int_map_remove(bio_queue_data->map, get_bio_sector(vio->bios_merged.head));
+	vdo_int_map_remove(bio_queue_data->map, get_bio_sector(vio->bios_merged.tail));
+	bio = vio->bios_merged.head;
+	bio_list_init(&vio->bios_merged);
+	mutex_unlock(&bio_queue_data->lock);
+
+	return bio;
+}
+
+/**
+ * process_data_vio_io() - Submit a data_vio's bio to the storage below along with any bios that
+ *                         have been merged with it.
+ *
+ * Context: This call may block and so should only be called from a bio thread.
+ */
+static void process_data_vio_io(struct vdo_completion *completion)
+{
+	struct bio *bio, *next;
+	struct vio *vio = as_vio(completion);
+
+	assert_in_bio_zone(vio);
+	for (bio = get_bio_list(vio); bio != NULL; bio = next) {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		send_bio_to_device((struct vio *) bio->bi_private, bio);
+	}
+}
+
+/**
+ * get_mergeable_locked() - Attempt to find an already queued bio that the current bio can be
+ *                          merged with.
+ * @map: The bio map to use for merging.
+ * @vio: The vio we want to merge.
+ * @back_merge: Set to true for a back merge, false for a front merge.
+ *
+ * There are two types of merging possible, forward and backward, which are distinguished by a flag
+ * that uses kernel elevator terminology.
+ *
+ * Return: The vio to merge with, or NULL if no merging is possible.
+ */
+static struct vio *get_mergeable_locked(struct int_map *map, struct vio *vio, bool back_merge)
+{
+	struct bio *bio = vio->bio;
+	sector_t merge_sector = get_bio_sector(bio);
+	struct vio *vio_merge;
+
+	if (back_merge)
+		merge_sector -= VDO_SECTORS_PER_BLOCK;
+	else
+		merge_sector += VDO_SECTORS_PER_BLOCK;
+
+	vio_merge = vdo_int_map_get(map, merge_sector);
+
+	if (vio_merge == NULL)
+		return NULL;
+
+	if (vio->completion.priority != vio_merge->completion.priority)
+		return NULL;
+
+	if (bio_data_dir(bio) != bio_data_dir(vio_merge->bio))
+		return NULL;
+
+	if (bio_list_empty(&vio_merge->bios_merged))
+		return NULL;
+
+	if (back_merge)
+		return ((get_bio_sector(vio_merge->bios_merged.tail) == merge_sector) ?
+			vio_merge :
+			NULL);
+
+	return ((get_bio_sector(vio_merge->bios_merged.head) == merge_sector) ? vio_merge : NULL);
+}
+
+static int map_merged_vio(struct int_map *bio_map, struct vio *vio)
+{
+	int result;
+
+	result = vdo_int_map_put(bio_map, get_bio_sector(vio->bios_merged.head), vio, true, NULL);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return vdo_int_map_put(bio_map, get_bio_sector(vio->bios_merged.tail), vio, true, NULL);
+}
+
+static int merge_to_prev_tail(struct int_map *bio_map, struct vio *vio, struct vio *prev_vio)
+{
+	vdo_int_map_remove(bio_map, get_bio_sector(prev_vio->bios_merged.tail));
+	bio_list_merge(&prev_vio->bios_merged, &vio->bios_merged);
+	return map_merged_vio(bio_map, prev_vio);
+}
+
+static int merge_to_next_head(struct int_map *bio_map, struct vio *vio, struct vio *next_vio)
+{
+	/*
+	 * Handle "next merge" and "gap fill" cases the same way so as to reorder bios in a way
+	 * that's compatible with using funnel queues in work queues. This avoids removing an
+	 * existing completion.
+	 */
+	vdo_int_map_remove(bio_map, get_bio_sector(next_vio->bios_merged.head));
+	bio_list_merge_head(&next_vio->bios_merged, &vio->bios_merged);
+	return map_merged_vio(bio_map, next_vio);
+}
+
+/**
+ * try_bio_map_merge() - Attempt to merge a vio's bio with other pending I/Os.
+ * @vio: The vio to merge.
+ *
+ * Currently this is only used for data_vios, but is broken out for future use with metadata vios.
+ *
+ * Return: whether or not the vio was merged.
+ */
+static bool try_bio_map_merge(struct vio *vio)
+{
+	int result;
+	bool merged = true;
+	struct bio *bio = vio->bio;
+	struct vio *prev_vio, *next_vio;
+	struct vdo *vdo = vio->completion.vdo;
+	struct bio_queue_data *bio_queue_data = &vdo->io_submitter->bio_queue_data[vio->bio_zone];
+
+	bio->bi_next = NULL;
+	bio_list_init(&vio->bios_merged);
+	bio_list_add(&vio->bios_merged, bio);
+
+	mutex_lock(&bio_queue_data->lock);
+	prev_vio = get_mergeable_locked(bio_queue_data->map, vio, true);
+	next_vio = get_mergeable_locked(bio_queue_data->map, vio, false);
+	if (prev_vio == next_vio)
+		next_vio = NULL;
+
+	if ((prev_vio == NULL) && (next_vio == NULL)) {
+		/* No merge; just add to the bio_queue. */
+		merged = false;
+		result = vdo_int_map_put(bio_queue_data->map,
+					 get_bio_sector(bio),
+					 vio,
+					 true,
+					 NULL);
+	} else if (next_vio == NULL) {
+		/* Only prev; merge to prev's tail. */
+		result = merge_to_prev_tail(bio_queue_data->map, vio, prev_vio);
+	} else {
+		/* Only next; merge to next's head. */
+		result = merge_to_next_head(bio_queue_data->map, vio, next_vio);
+	}
+
+	mutex_unlock(&bio_queue_data->lock);
+
+	/* We don't care about failure of int_map_put in this case. */
+	ASSERT_LOG_ONLY(result == UDS_SUCCESS, "bio map insertion succeeds");
+	return merged;
+}
+
+/**
+ * submit_data_vio_io() - Submit I/O for a data_vio.
+ * @data_vio: the data_vio for which to issue I/O.
+ *
+ * If possible, this I/O will be merged with other pending I/Os. Otherwise, the data_vio will be
+ * sent to the appropriate bio zone directly.
+ */
+void submit_data_vio_io(struct data_vio *data_vio)
+{
+	if (try_bio_map_merge(&data_vio->vio))
+		return;
+
+	launch_data_vio_bio_zone_callback(data_vio, process_data_vio_io);
+}
+
+/**
+ * vdo_submit_metadata_io() - Submit I/O for a metadata vio.
+ * @vio: the vio for which to issue I/O
+ * @physical: the physical block number to read or write
+ * @callback: the bio endio function which will be called after the I/O completes
+ * @error_handler: the handler for submission or I/O errors (may be NULL)
+ * @operation: the type of I/O to perform
+ * @data: the buffer to read or write (may be NULL)
+ *
+ * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block
+ * other vdo threads.
+ *
+ * The error handler will only run on the correct thread as long as the thread calling this
+ * function and the thread set in the endio callback are the same, and as long as no error can
+ * occur on the bio queue. Currently this is true for all callers, but additional care will be
+ * needed if this ever changes.
+ */
+void vdo_submit_metadata_io(struct vio *vio,
+			    physical_block_number_t physical,
+			    bio_end_io_t callback,
+			    vdo_action *error_handler,
+			    unsigned int operation,
+			    char *data)
+{
+	struct vdo_completion *completion = &vio->completion;
+	int result;
+	const struct admin_state_code *code = vdo_get_admin_state(completion->vdo);
+
+	ASSERT_LOG_ONLY(!code->quiescent, "I/O not allowed in state %s", code->name);
+	ASSERT_LOG_ONLY(vio->bio->bi_next == NULL, "metadata bio has no next bio");
+
+	vdo_reset_completion(completion);
+	completion->error_handler = error_handler;
+	result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical);
+	if (result != VDO_SUCCESS) {
+		continue_vio(vio, result);
+		return;
+	}
+
+	vdo_set_completion_callback(completion, process_vio_io, get_vio_bio_zone_thread_id(vio));
+	vdo_launch_completion_with_priority(completion, get_metadata_priority(vio));
+}
+
+/**
+ * vdo_make_io_submitter() - Create an io_submitter structure.
+ * @thread_count: Number of bio-submission threads to set up.
+ * @rotation_interval: Interval to use when rotating between bio-submission threads when enqueuing
+ *                     completions.
+ * @max_requests_active: Number of bios for merge tracking.
+ * @vdo: The vdo which will use this submitter.
+ * @io_submitter_ptr: A pointer to hold the new io_submitter.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_make_io_submitter(unsigned int thread_count,
+			  unsigned int rotation_interval,
+			  unsigned int max_requests_active,
+			  struct vdo *vdo,
+			  struct io_submitter **io_submitter_ptr)
+{
+	unsigned int i;
+	struct io_submitter *io_submitter;
+	int result;
+
+	result = UDS_ALLOCATE_EXTENDED(struct io_submitter,
+				       thread_count,
+				       struct bio_queue_data,
+				       "bio submission data",
+				       &io_submitter);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	io_submitter->bio_queue_rotation_interval = rotation_interval;
+
+	/* Setup for each bio-submission work queue */
+	for (i = 0; i < thread_count; i++) {
+		struct bio_queue_data *bio_queue_data = &io_submitter->bio_queue_data[i];
+
+		mutex_init(&bio_queue_data->lock);
+		/*
+		 * One I/O operation per request, but both the first and last sector numbers are
+		 * used as map keys, so the map is sized for two entries per request.
+		 *
+		 * If requests are assigned to threads round-robin, they should be distributed
+		 * quite evenly. But if they're assigned based on PBN, things can sometimes be very
+		 * uneven. So for now, we'll assume that all requests *may* wind up on one thread,
+		 * and thus all in the same map.
+		 */
+		result = vdo_make_int_map(max_requests_active * 2, 0, &bio_queue_data->map);
+		if (result != 0) {
+			/*
+			 * Clean up the partially initialized bio-queue entirely and indicate that
+			 * initialization failed.
+			 */
+			uds_log_error("bio map initialization failed %d", result);
+			vdo_cleanup_io_submitter(io_submitter);
+			vdo_free_io_submitter(io_submitter);
+			return result;
+		}
+
+		bio_queue_data->queue_number = i;
+		result = vdo_make_thread(vdo,
+					 vdo->thread_config.bio_threads[i],
+					 &bio_queue_type,
+					 1,
+					 (void **) &bio_queue_data);
+		if (result != VDO_SUCCESS) {
+			/*
+			 * Clean up the partially initialized bio-queue entirely and indicate that
+			 * initialization failed.
+			 */
+			vdo_free_int_map(UDS_FORGET(bio_queue_data->map));
+			uds_log_error("bio queue initialization failed %d", result);
+			vdo_cleanup_io_submitter(io_submitter);
+			vdo_free_io_submitter(io_submitter);
+			return result;
+		}
+
+		bio_queue_data->queue = vdo->threads[vdo->thread_config.bio_threads[i]].queue;
+		io_submitter->num_bio_queues_used++;
+	}
+
+	*io_submitter_ptr = io_submitter;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_cleanup_io_submitter() - Tear down the io_submitter fields as needed for a physical layer.
+ * @io_submitter: The I/O submitter data to tear down (may be NULL).
+ */
+void vdo_cleanup_io_submitter(struct io_submitter *io_submitter)
+{
+	int i;
+
+	if (io_submitter == NULL)
+		return;
+
+	for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--)
+		vdo_finish_work_queue(io_submitter->bio_queue_data[i].queue);
+}
+
+/**
+ * vdo_free_io_submitter() - Free the io_submitter fields and structure as needed.
+ * @io_submitter: The I/O submitter data to destroy.
+ *
+ * This must be called after vdo_cleanup_io_submitter(). It is used to release resources late in
+ * the shutdown process to avoid or reduce the chance of race conditions.
+ */
+void vdo_free_io_submitter(struct io_submitter *io_submitter)
+{
+	int i;
+
+	if (io_submitter == NULL)
+		return;
+
+	for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--) {
+		io_submitter->num_bio_queues_used--;
+		/* vdo_destroy() will free the work queue, so just give up our reference to it. */
+		UDS_FORGET(io_submitter->bio_queue_data[i].queue);
+		vdo_free_int_map(UDS_FORGET(io_submitter->bio_queue_data[i].map));
+	}
+	UDS_FREE(io_submitter);
+}
diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h
new file mode 100644
index 00000000000..5bd05090f67
--- /dev/null
+++ b/drivers/md/dm-vdo/io-submitter.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_IO_SUBMITTER_H
+#define VDO_IO_SUBMITTER_H
+
+#include <linux/bio.h>
+
+#include "types.h"
+
+struct io_submitter;
+
+int vdo_make_io_submitter(unsigned int thread_count,
+			  unsigned int rotation_interval,
+			  unsigned int max_requests_active,
+			  struct vdo *vdo,
+			  struct io_submitter **io_submitter);
+
+void vdo_cleanup_io_submitter(struct io_submitter *io_submitter);
+
+void vdo_free_io_submitter(struct io_submitter *io_submitter);
+
+void process_vio_io(struct vdo_completion *completion);
+
+void submit_data_vio_io(struct data_vio *data_vio);
+
+void vdo_submit_metadata_io(struct vio *vio,
+			    physical_block_number_t physical,
+			    bio_end_io_t callback,
+			    vdo_action *error_handler,
+			    unsigned int operation,
+			    char *data);
+
+static inline void submit_metadata_vio(struct vio *vio,
+				       physical_block_number_t physical,
+				       bio_end_io_t callback,
+				       vdo_action *error_handler,
+				       unsigned int operation)
+{
+	vdo_submit_metadata_io(vio, physical, callback, error_handler, operation, vio->data);
+}
+
+static inline void
+submit_flush_vio(struct vio *vio, bio_end_io_t callback, vdo_action *error_handler)
+{
+	/* FIXME: Can we just use REQ_OP_FLUSH? */
+	vdo_submit_metadata_io(vio, 0, callback, error_handler, REQ_OP_WRITE | REQ_PREFLUSH, NULL);
+}
+
+#endif /* VDO_IO_SUBMITTER_H */
diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c
new file mode 100644
index 00000000000..6fffac8169a
--- /dev/null
+++ b/drivers/md/dm-vdo/logical-zone.c
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "logical-zone.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "flush.h"
+#include "int-map.h"
+#include "physical-zone.h"
+#include "vdo.h"
+
+enum {
+	ALLOCATIONS_PER_ZONE = 128,
+};
+
+/**
+ * as_logical_zone() - Convert a generic vdo_completion to a logical_zone.
+ * @completion: The completion to convert.
+ *
+ * Return: The completion as a logical_zone.
+ */
+static struct logical_zone *as_logical_zone(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_GENERATION_FLUSHED_COMPLETION);
+	return container_of(completion, struct logical_zone, completion);
+}
+
+/* get_thread_id_for_zone() - Implements vdo_zone_thread_getter. */
+static thread_id_t get_thread_id_for_zone(void *context, zone_count_t zone_number)
+{
+	struct logical_zones *zones = context;
+
+	return zones->zones[zone_number].thread_id;
+}
+
+/**
+ * initialize_zone() - Initialize a logical zone.
+ * @zones: The logical_zones to which this zone belongs.
+ * @zone_number: The logical_zone's index.
+ */
+static int initialize_zone(struct logical_zones *zones, zone_count_t zone_number)
+{
+	int result;
+	struct vdo *vdo = zones->vdo;
+	struct logical_zone *zone = &zones->zones[zone_number];
+	zone_count_t allocation_zone_number;
+
+	result = vdo_make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->lbn_operations);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (zone_number < vdo->thread_config.logical_zone_count - 1)
+		zone->next = &zones->zones[zone_number + 1];
+
+	vdo_initialize_completion(&zone->completion, vdo, VDO_GENERATION_FLUSHED_COMPLETION);
+	zone->zones = zones;
+	zone->zone_number = zone_number;
+	zone->thread_id = vdo->thread_config.logical_threads[zone_number];
+	zone->block_map_zone = &vdo->block_map->zones[zone_number];
+	INIT_LIST_HEAD(&zone->write_vios);
+	vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+
+	allocation_zone_number = zone->thread_id % vdo->thread_config.physical_zone_count;
+	zone->allocation_zone = &vdo->physical_zones->zones[allocation_zone_number];
+
+	return vdo_make_default_thread(vdo, zone->thread_id);
+}
+
+/**
+ * vdo_make_logical_zones() - Create a set of logical zones.
+ * @vdo: The vdo to which the zones will belong.
+ * @zones_ptr: A pointer to hold the new zones.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr)
+{
+	struct logical_zones *zones;
+	int result;
+	zone_count_t zone;
+	zone_count_t zone_count = vdo->thread_config.logical_zone_count;
+
+	if (zone_count == 0)
+		return VDO_SUCCESS;
+
+	result = UDS_ALLOCATE_EXTENDED(struct logical_zones, zone_count,
+				       struct logical_zone, __func__, &zones);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	zones->vdo = vdo;
+	zones->zone_count = zone_count;
+	for (zone = 0; zone < zone_count; zone++) {
+		result = initialize_zone(zones, zone);
+		if (result != VDO_SUCCESS) {
+			vdo_free_logical_zones(zones);
+			return result;
+		}
+	}
+
+	result = vdo_make_action_manager(zones->zone_count,
+					 get_thread_id_for_zone,
+					 vdo->thread_config.admin_thread,
+					 zones,
+					 NULL,
+					 vdo,
+					 &zones->manager);
+	if (result != VDO_SUCCESS) {
+		vdo_free_logical_zones(zones);
+		return result;
+	}
+
+	*zones_ptr = zones;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_logical_zones() - Free a set of logical zones.
+ * @zones: The set of zones to free.
+ */
+void vdo_free_logical_zones(struct logical_zones *zones)
+{
+	zone_count_t index;
+
+	if (zones == NULL)
+		return;
+
+	UDS_FREE(UDS_FORGET(zones->manager));
+
+	for (index = 0; index < zones->zone_count; index++)
+		vdo_free_int_map(UDS_FORGET(zones->zones[index].lbn_operations));
+
+	UDS_FREE(zones);
+}
+
+static inline void assert_on_zone_thread(struct logical_zone *zone, const char *what)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
+			"%s() called on correct thread", what);
+}
+
+/**
+ * check_for_drain_complete() - Check whether this zone has drained.
+ * @zone: The zone to check.
+ */
+static void check_for_drain_complete(struct logical_zone *zone)
+{
+	if (!vdo_is_state_draining(&zone->state) || zone->notifying ||
+	    !list_empty(&zone->write_vios))
+		return;
+
+	vdo_finish_draining(&zone->state);
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ *
+ * Implements vdo_admin_initiator.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	check_for_drain_complete(container_of(state, struct logical_zone, state));
+}
+
+/**
+ * drain_logical_zone() - Drain a logical zone.
+ *
+ * Implements vdo_zone_action.
+ */
+static void
+drain_logical_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent)
+{
+	struct logical_zones *zones = context;
+
+	vdo_start_draining(&zones->zones[zone_number].state,
+			   vdo_get_current_manager_operation(zones->manager),
+			   parent,
+			   initiate_drain);
+}
+
+void vdo_drain_logical_zones(struct logical_zones *zones,
+			     const struct admin_state_code *operation,
+			     struct vdo_completion *parent)
+{
+	vdo_schedule_operation(zones->manager, operation, NULL, drain_logical_zone, NULL, parent);
+}
+
+/**
+ * resume_logical_zone() - Resume a logical zone.
+ *
+ * Implements vdo_zone_action.
+ */
+static void
+resume_logical_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent)
+{
+	struct logical_zone *zone = &(((struct logical_zones *) context)->zones[zone_number]);
+
+	vdo_fail_completion(parent, vdo_resume_if_quiescent(&zone->state));
+}
+
+/**
+ * vdo_resume_logical_zones() - Resume a set of logical zones.
+ * @zones: The logical zones to resume.
+ * @parent: The object to notify when the zones have resumed.
+ */
+void vdo_resume_logical_zones(struct logical_zones *zones, struct vdo_completion *parent)
+{
+	vdo_schedule_operation(zones->manager, VDO_ADMIN_STATE_RESUMING, NULL,
+			       resume_logical_zone, NULL, parent);
+}
+
+/**
+ * update_oldest_active_generation() - Update the oldest active generation.
+ * @zone: The zone.
+ *
+ * Return: true if the oldest active generation has changed.
+ */
+static bool update_oldest_active_generation(struct logical_zone *zone)
+{
+	struct data_vio *data_vio =
+		list_first_entry_or_null(&zone->write_vios, struct data_vio, write_entry);
+	sequence_number_t oldest =
+		(data_vio == NULL) ? zone->flush_generation : data_vio->flush_generation;
+
+	if (oldest == zone->oldest_active_generation)
+		return false;
+
+	WRITE_ONCE(zone->oldest_active_generation, oldest);
+	return true;
+}
+
+/**
+ * vdo_increment_logical_zone_flush_generation() - Increment the flush generation in a logical
+ *                                                 zone.
+ * @zone: The logical zone.
+ * @expected_generation: The expected value of the flush generation before the increment.
+ */
+void vdo_increment_logical_zone_flush_generation(struct logical_zone *zone,
+						 sequence_number_t expected_generation)
+{
+	assert_on_zone_thread(zone, __func__);
+	ASSERT_LOG_ONLY((zone->flush_generation == expected_generation),
+			"logical zone %u flush generation %llu should be %llu before increment",
+			zone->zone_number,
+			(unsigned long long) zone->flush_generation,
+			(unsigned long long) expected_generation);
+
+	zone->flush_generation++;
+	zone->ios_in_flush_generation = 0;
+	update_oldest_active_generation(zone);
+}
+
+/**
+ * vdo_acquire_flush_generation_lock() - Acquire the shared lock on a flush generation by a write
+ *                                       data_vio.
+ * @data_vio: The data_vio.
+ */
+void vdo_acquire_flush_generation_lock(struct data_vio *data_vio)
+{
+	struct logical_zone *zone = data_vio->logical.zone;
+
+	assert_on_zone_thread(zone, __func__);
+	ASSERT_LOG_ONLY(vdo_is_state_normal(&zone->state), "vdo state is normal");
+
+	data_vio->flush_generation = zone->flush_generation;
+	list_add_tail(&data_vio->write_entry, &zone->write_vios);
+	zone->ios_in_flush_generation++;
+}
+
+static void attempt_generation_complete_notification(struct vdo_completion *completion);
+
+/**
+ * notify_flusher() - Notify the flusher that at least one generation no longer has active VIOs.
+ * @completion: The zone completion.
+ *
+ * This callback is registered in attempt_generation_complete_notification().
+ */
+static void notify_flusher(struct vdo_completion *completion)
+{
+	struct logical_zone *zone = as_logical_zone(completion);
+
+	vdo_complete_flushes(zone->zones->vdo->flusher);
+	vdo_launch_completion_callback(completion,
+				       attempt_generation_complete_notification,
+				       zone->thread_id);
+}
+
+/**
+ * attempt_generation_complete_notification() - Notify the flusher if some generation no longer
+ *                                              has active VIOs.
+ * @completion: The zone completion.
+ */
+static void attempt_generation_complete_notification(struct vdo_completion *completion)
+{
+	struct logical_zone *zone = as_logical_zone(completion);
+
+	assert_on_zone_thread(zone, __func__);
+	if (zone->oldest_active_generation <= zone->notification_generation) {
+		zone->notifying = false;
+		check_for_drain_complete(zone);
+		return;
+	}
+
+	zone->notifying = true;
+	zone->notification_generation = zone->oldest_active_generation;
+	vdo_launch_completion_callback(&zone->completion, notify_flusher,
+				       vdo_get_flusher_thread_id(zone->zones->vdo->flusher));
+}
+
+/**
+ * vdo_release_flush_generation_lock() - Release the shared lock on a flush generation held by a
+ *                                       write data_vio.
+ * @data_vio: The data_vio whose lock is to be released.
+ *
+ * If there are pending flushes, and this data_vio completes the oldest generation active in this
+ * zone, an attempt will be made to finish any flushes which may now be complete.
+ */
+void vdo_release_flush_generation_lock(struct data_vio *data_vio)
+{
+	struct logical_zone *zone = data_vio->logical.zone;
+
+	assert_on_zone_thread(zone, __func__);
+
+	if (!data_vio_has_flush_generation_lock(data_vio))
+		return;
+
+	list_del_init(&data_vio->write_entry);
+	ASSERT_LOG_ONLY((zone->oldest_active_generation <= data_vio->flush_generation),
+			"data_vio releasing lock on generation %llu is not older than oldest active generation %llu",
+			(unsigned long long) data_vio->flush_generation,
+			(unsigned long long) zone->oldest_active_generation);
+
+	if (!update_oldest_active_generation(zone) || zone->notifying)
+		return;
+
+	attempt_generation_complete_notification(&zone->completion);
+}
+
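+/**
+ * vdo_get_next_allocation_zone() - Get the physical zone from which this logical zone should
+ *                                  allocate next, rotating to the next physical zone after every
+ *                                  ALLOCATIONS_PER_ZONE allocations.
+ * @zone: The logical zone.
+ *
+ * Return: The physical zone from which to allocate.
+ */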
+struct physical_zone *vdo_get_next_allocation_zone(struct logical_zone *zone)
+{
+	if (zone->allocation_count == ALLOCATIONS_PER_ZONE) {
+		zone->allocation_count = 0;
+		zone->allocation_zone = zone->allocation_zone->next;
+	}
+
+	zone->allocation_count++;
+	return zone->allocation_zone;
+}
+
+/**
+ * vdo_dump_logical_zone() - Dump information about a logical zone to the log for debugging.
+ * @zone: The zone to dump.
+ *
+ * Context: The information is dumped in a thread-unsafe fashion.
+ */
+void vdo_dump_logical_zone(const struct logical_zone *zone)
+{
+	uds_log_info("logical_zone %u", zone->zone_number);
+	uds_log_info("  flush_generation=%llu oldest_active_generation=%llu notification_generation=%llu notifying=%s ios_in_flush_generation=%llu",
+		     (unsigned long long) READ_ONCE(zone->flush_generation),
+		     (unsigned long long) READ_ONCE(zone->oldest_active_generation),
+		     (unsigned long long) READ_ONCE(zone->notification_generation),
+		     uds_bool_to_string(READ_ONCE(zone->notifying)),
+		     (unsigned long long) READ_ONCE(zone->ios_in_flush_generation));
+}
diff --git a/drivers/md/dm-vdo/logical-zone.h b/drivers/md/dm-vdo/logical-zone.h
new file mode 100644
index 00000000000..3da788b0c2f
--- /dev/null
+++ b/drivers/md/dm-vdo/logical-zone.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_LOGICAL_ZONE_H
+#define VDO_LOGICAL_ZONE_H
+
+#include <linux/list.h>
+
+#include "admin-state.h"
+#include "int-map.h"
+#include "types.h"
+
+struct physical_zone;
+
+struct logical_zone {
+	/* The completion for flush notifications */
+	struct vdo_completion completion;
+	/* The owner of this zone */
+	struct logical_zones *zones;
+	/* Which logical zone this is */
+	zone_count_t zone_number;
+	/* The thread id for this zone */
+	thread_id_t thread_id;
+	/* In progress operations keyed by LBN */
+	struct int_map *lbn_operations;
+	/* The logical to physical map */
+	struct block_map_zone *block_map_zone;
+	/* The current flush generation */
+	sequence_number_t flush_generation;
+	/*
+	 * The oldest active generation in this zone. This is mutated only on the logical zone
+	 * thread but is queried from the flusher thread.
+	 */
+	sequence_number_t oldest_active_generation;
+	/* The number of IOs in the current flush generation */
+	block_count_t ios_in_flush_generation;
+	/* The youngest generation of the current notification */
+	sequence_number_t notification_generation;
+	/* Whether a notification is in progress */
+	bool notifying;
+	/* The queue of active data write VIOs */
+	struct list_head write_vios;
+	/* The administrative state of the zone */
+	struct admin_state state;
+	/* The physical zone from which to allocate */
+	struct physical_zone *allocation_zone;
+	/* The number of allocations done from the current allocation_zone */
+	block_count_t allocation_count;
+	/* The next zone */
+	struct logical_zone *next;
+};
+
+struct logical_zones {
+	/* The vdo whose zones these are */
+	struct vdo *vdo;
+	/* The manager for administrative actions */
+	struct action_manager *manager;
+	/* The number of zones */
+	zone_count_t zone_count;
+	/* The logical zones themselves */
+	struct logical_zone zones[];
+};
+
+int __must_check vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr);
+
+void vdo_free_logical_zones(struct logical_zones *zones);
+
+void vdo_drain_logical_zones(struct logical_zones *zones,
+			     const struct admin_state_code *operation,
+			     struct vdo_completion *completion);
+
+void vdo_resume_logical_zones(struct logical_zones *zones, struct vdo_completion *parent);
+
+void vdo_increment_logical_zone_flush_generation(struct logical_zone *zone,
+						 sequence_number_t expected_generation);
+
+void vdo_acquire_flush_generation_lock(struct data_vio *data_vio);
+
+void vdo_release_flush_generation_lock(struct data_vio *data_vio);
+
+struct physical_zone * __must_check vdo_get_next_allocation_zone(struct logical_zone *zone);
+
+void vdo_dump_logical_zone(const struct logical_zone *zone);
+
+#endif /* VDO_LOGICAL_ZONE_H */
diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c
new file mode 100644
index 00000000000..43a37623b90
--- /dev/null
+++ b/drivers/md/dm-vdo/message-stats.c
@@ -0,0 +1,1222 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "dedupe.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "message-stats.h"
+#include "statistics.h"
+#include "thread-device.h"
+#include "vdo.h"
+
+static int write_u64(char *prefix,
+		     u64 value,
+		     char *suffix,
+		     char **buf,
+		     unsigned int *maxlen)
+{
+	int count = snprintf(*buf, *maxlen, "%s%llu%s",
+			     prefix == NULL ? "" : prefix,
+			     value,
+			     suffix == NULL ? "" : suffix);
+
+	/* snprintf() reports the untruncated length, so this detects an overflowing write. */
+	if (count >= *maxlen)
+		return VDO_UNEXPECTED_EOF;
+
+	*buf += count;
+	*maxlen -= count;
+	return VDO_SUCCESS;
+}
+
+static int write_u32(char *prefix,
+		     u32 value,
+		     char *suffix,
+		     char **buf,
+		     unsigned int *maxlen)
+{
+	int count = snprintf(*buf, *maxlen, "%s%u%s",
+			     prefix == NULL ? "" : prefix,
+			     value,
+			     suffix == NULL ? "" : suffix);
+
+	if (count >= *maxlen)
+		return VDO_UNEXPECTED_EOF;
+
+	*buf += count;
+	*maxlen -= count;
+	return VDO_SUCCESS;
+}
+
+static int write_block_count_t(char *prefix,
+			       block_count_t value,
+			       char *suffix,
+			       char **buf,
+			       unsigned int *maxlen)
+{
+	int count = scnprintf(*buf, *maxlen, "%s%llu%s",
+			      prefix == NULL ? "" : prefix,
+			      value,
+			      suffix == NULL ? "" : suffix);
+	*buf += count;
+	*maxlen -= count;
+	if (count >= *maxlen)
+		return VDO_UNEXPECTED_EOF;
+	return VDO_SUCCESS;
+}
+
+static int write_string(char *prefix,
+			char *value,
+			char *suffix,
+			char **buf,
+			unsigned int *maxlen)
+{
+	int count = scnprintf(*buf, *maxlen, "%s%s%s",
+			      prefix == NULL ? "" : prefix,
+			      value,
+			      suffix == NULL ? "" : suffix);
+	*buf += count;
+	*maxlen -= count;
+	if (count >= *maxlen)
+		return VDO_UNEXPECTED_EOF;
+	return VDO_SUCCESS;
+}
+
+static int write_bool(char *prefix,
+		      bool value,
+		      char *suffix,
+		      char **buf,
+		      unsigned int *maxlen)
+{
+	int count = scnprintf(*buf, *maxlen, "%s%d%s",
+			      prefix == NULL ? "" : prefix,
+			      value,
+			      suffix == NULL ? "" : suffix);
+	*buf += count;
+	*maxlen -= count;
+	if (count >= *maxlen)
+		return VDO_UNEXPECTED_EOF;
+	return VDO_SUCCESS;
+}
+
+static int write_u8(char *prefix,
+		    u8 value,
+		    char *suffix,
+		    char **buf,
+		    unsigned int *maxlen)
+{
+	int count = scnprintf(*buf, *maxlen, "%s%u%s",
+			      prefix == NULL ? "" : prefix,
+			      value,
+			      suffix == NULL ? "" : suffix);
+	*buf += count;
+	*maxlen -= count;
+	if (count >= *maxlen)
+		return VDO_UNEXPECTED_EOF;
+	return VDO_SUCCESS;
+}
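For reference, these helpers are composed by the per-structure writers below into a single brace-delimited, YAML-like rendering of the statistics. A minimal sketch of the calling pattern, using a made-up field name and value rather than anything from the real statistics definitions:

static int example_write_object(char **buf, unsigned int *maxlen)
{
	int result;

	/* Produces text of the form "example : { exampleCount : 42, }". */
	result = write_string("example : ", "{ ", NULL, buf, maxlen);
	if (result != VDO_SUCCESS)
		return result;

	/* 42 is an arbitrary illustrative value. */
	result = write_u64("exampleCount : ", 42, ", ", buf, maxlen);
	if (result != VDO_SUCCESS)
		return result;

	return write_string(NULL, "}", NULL, buf, maxlen);
}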
+
+static int write_block_allocator_statistics(char *prefix,
+					    struct block_allocator_statistics *stats,
+					    char *suffix,
+					    char **buf,
+					    unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The total number of slabs from which blocks may be allocated */
+	result = write_u64("slabCount : ",
+			   stats->slab_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The total number of slabs from which blocks have ever been allocated */
+	result = write_u64("slabsOpened : ",
+			   stats->slabs_opened,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The number of times since loading that a slab has been re-opened */
+	result = write_u64("slabsReopened : ",
+			   stats->slabs_reopened,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_commit_statistics(char *prefix,
+				   struct commit_statistics *stats,
+				   char *suffix,
+				   char **buf,
+				   unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The total number of items on which processing has started */
+	result = write_u64("started : ",
+			   stats->started,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The total number of items for which a write operation has been issued */
+	result = write_u64("written : ",
+			   stats->written,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The total number of items for which a write operation has completed */
+	result = write_u64("committed : ",
+			   stats->committed,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_recovery_journal_statistics(char *prefix,
+					     struct recovery_journal_statistics *stats,
+					     char *suffix,
+					     char **buf,
+					     unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times the on-disk journal was full */
+	result = write_u64("diskFull : ",
+			   stats->disk_full,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times the recovery journal requested slab journal commits. */
+	result = write_u64("slabJournalCommitsRequested : ",
+			   stats->slab_journal_commits_requested,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Write/Commit totals for individual journal entries */
+	result = write_commit_statistics("entries : ",
+					 &stats->entries,
+					 ", ",
+					 buf,
+					 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Write/Commit totals for journal blocks */
+	result = write_commit_statistics("blocks : ",
+					 &stats->blocks,
+					 ", ",
+					 buf,
+					 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_packer_statistics(char *prefix,
+				   struct packer_statistics *stats,
+				   char *suffix,
+				   char **buf,
+				   unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of compressed data items written since startup */
+	result = write_u64("compressedFragmentsWritten : ",
+			   stats->compressed_fragments_written,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of blocks containing compressed items written since startup */
+	result = write_u64("compressedBlocksWritten : ",
+			   stats->compressed_blocks_written,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of VIOs that are pending in the packer */
+	result = write_u64("compressedFragmentsInPacker : ",
+			   stats->compressed_fragments_in_packer,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_slab_journal_statistics(char *prefix,
+					 struct slab_journal_statistics *stats,
+					 char *suffix,
+					 char **buf,
+					 unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times the on-disk journal was full */
+	result = write_u64("diskFullCount : ",
+			   stats->disk_full_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times an entry was added over the flush threshold */
+	result = write_u64("flushCount : ",
+			   stats->flush_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times an entry was added over the block threshold */
+	result = write_u64("blockedCount : ",
+			   stats->blocked_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times a tail block was written */
+	result = write_u64("blocksWritten : ",
+			   stats->blocks_written,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times we had to wait for the tail to write */
+	result = write_u64("tailBusyCount : ",
+			   stats->tail_busy_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_slab_summary_statistics(char *prefix,
+					 struct slab_summary_statistics *stats,
+					 char *suffix,
+					 char **buf,
+					 unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of blocks written */
+	result = write_u64("blocksWritten : ",
+			   stats->blocks_written,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_ref_counts_statistics(char *prefix,
+				       struct ref_counts_statistics *stats,
+				       char *suffix,
+				       char **buf,
+				       unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of reference blocks written */
+	result = write_u64("blocksWritten : ",
+			   stats->blocks_written,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_block_map_statistics(char *prefix,
+				      struct block_map_statistics *stats,
+				      char *suffix,
+				      char **buf,
+				      unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of dirty (resident) pages */
+	result = write_u32("dirtyPages : ",
+			   stats->dirty_pages,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of clean (resident) pages */
+	result = write_u32("cleanPages : ",
+			   stats->clean_pages,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of free pages */
+	result = write_u32("freePages : ",
+			   stats->free_pages,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of pages in failed state */
+	result = write_u32("failedPages : ",
+			   stats->failed_pages,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of pages incoming */
+	result = write_u32("incomingPages : ",
+			   stats->incoming_pages,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of pages outgoing */
+	result = write_u32("outgoingPages : ",
+			   stats->outgoing_pages,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of times a free page was not available */
+	result = write_u32("cachePressure : ",
+			   stats->cache_pressure,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of get_vdo_page() calls for read */
+	result = write_u64("readCount : ",
+			   stats->read_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of get_vdo_page() calls for write */
+	result = write_u64("writeCount : ",
+			   stats->write_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of times pages failed to read */
+	result = write_u64("failedReads : ",
+			   stats->failed_reads,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of times pages failed to write */
+	result = write_u64("failedWrites : ",
+			   stats->failed_writes,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of gets that are reclaimed */
+	result = write_u64("reclaimed : ",
+			   stats->reclaimed,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of gets for outgoing pages */
+	result = write_u64("readOutgoing : ",
+			   stats->read_outgoing,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of gets that were already there */
+	result = write_u64("foundInCache : ",
+			   stats->found_in_cache,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of gets requiring discard */
+	result = write_u64("discardRequired : ",
+			   stats->discard_required,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of gets enqueued for their page */
+	result = write_u64("waitForPage : ",
+			   stats->wait_for_page,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of gets that have to fetch */
+	result = write_u64("fetchRequired : ",
+			   stats->fetch_required,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of page fetches */
+	result = write_u64("pagesLoaded : ",
+			   stats->pages_loaded,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of page saves */
+	result = write_u64("pagesSaved : ",
+			   stats->pages_saved,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* the number of flushes issued */
+	result = write_u64("flushCount : ",
+			   stats->flush_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_hash_lock_statistics(char *prefix,
+				      struct hash_lock_statistics *stats,
+				      char *suffix,
+				      char **buf,
+				      unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times the UDS advice proved correct */
+	result = write_u64("dedupeAdviceValid : ",
+			   stats->dedupe_advice_valid,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times the UDS advice proved incorrect */
+	result = write_u64("dedupeAdviceStale : ",
+			   stats->dedupe_advice_stale,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of writes with the same data as another in-flight write */
+	result = write_u64("concurrentDataMatches : ",
+			   stats->concurrent_data_matches,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of writes whose hash collided with an in-flight write */
+	result = write_u64("concurrentHashCollisions : ",
+			   stats->concurrent_hash_collisions,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Current number of dedupe queries that are in flight */
+	result = write_u32("currDedupeQueries : ",
+			   stats->curr_dedupe_queries,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_error_statistics(char *prefix,
+				  struct error_statistics *stats,
+				  char *suffix,
+				  char **buf,
+				  unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of times VDO got an invalid dedupe advice PBN from UDS */
+	result = write_u64("invalidAdvicePBNCount : ",
+			   stats->invalid_advice_pbn_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of times a VIO completed with a VDO_NO_SPACE error */
+	result = write_u64("noSpaceErrorCount : ",
+			   stats->no_space_error_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of times a VIO completed with a VDO_READ_ONLY error */
+	result = write_u64("readOnlyErrorCount : ",
+			   stats->read_only_error_count,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_bio_stats(char *prefix,
+			   struct bio_stats *stats,
+			   char *suffix,
+			   char **buf,
+			   unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of REQ_OP_READ bios */
+	result = write_u64("read : ",
+			   stats->read,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of REQ_OP_WRITE bios with data */
+	result = write_u64("write : ",
+			   stats->write,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+	result = write_u64("emptyFlush : ",
+			   stats->empty_flush,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of REQ_OP_DISCARD bios */
+	result = write_u64("discard : ",
+			   stats->discard,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of bios tagged with REQ_PREFLUSH */
+	result = write_u64("flush : ",
+			   stats->flush,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of bios tagged with REQ_FUA */
+	result = write_u64("fua : ",
+			   stats->fua,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_memory_usage(char *prefix,
+			      struct memory_usage *stats,
+			      char *suffix,
+			      char **buf,
+			      unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Tracked bytes currently allocated. */
+	result = write_u64("bytesUsed : ",
+			   stats->bytes_used,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Maximum tracked bytes allocated. */
+	result = write_u64("peakBytesUsed : ",
+			   stats->peak_bytes_used,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_index_statistics(char *prefix,
+				  struct index_statistics *stats,
+				  char *suffix,
+				  char **buf,
+				  unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of records stored in the index */
+	result = write_u64("entriesIndexed : ",
+			   stats->entries_indexed,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of post calls that found an existing entry */
+	result = write_u64("postsFound : ",
+			   stats->posts_found,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of post calls that added a new entry */
+	result = write_u64("postsNotFound : ",
+			   stats->posts_not_found,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of query calls that found an existing entry */
+	result = write_u64("queriesFound : ",
+			   stats->queries_found,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of query calls that added a new entry */
+	result = write_u64("queriesNotFound : ",
+			   stats->queries_not_found,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of update calls that found an existing entry */
+	result = write_u64("updatesFound : ",
+			   stats->updates_found,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of update calls that added a new entry */
+	result = write_u64("updatesNotFound : ",
+			   stats->updates_not_found,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of entries discarded */
+	result = write_u64("entriesDiscarded : ",
+			   stats->entries_discarded,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+static int write_vdo_statistics(char *prefix,
+				struct vdo_statistics *stats,
+				char *suffix,
+				char **buf,
+				unsigned int *maxlen)
+{
+	int result;
+
+	result = write_string(prefix, "{ ", NULL, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_u32("version : ",
+			   stats->version,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_u32("releaseVersion : ",
+			   stats->release_version,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of blocks used for data */
+	result = write_u64("dataBlocksUsed : ",
+			   stats->data_blocks_used,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of blocks used for VDO metadata */
+	result = write_u64("overheadBlocksUsed : ",
+			   stats->overhead_blocks_used,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of logical blocks that are currently mapped to physical blocks */
+	result = write_u64("logicalBlocksUsed : ",
+			   stats->logical_blocks_used,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of physical blocks */
+	result = write_block_count_t("physicalBlocks : ",
+				     stats->physical_blocks,
+				     ", ",
+				     buf,
+				     maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* number of logical blocks */
+	result = write_block_count_t("logicalBlocks : ",
+				     stats->logical_blocks,
+				     ", ",
+				     buf,
+				     maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Size of the block map page cache, in bytes */
+	result = write_u64("blockMapCacheSize : ",
+			   stats->block_map_cache_size,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The physical block size */
+	result = write_u64("blockSize : ",
+			   stats->block_size,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times the VDO has successfully recovered */
+	result = write_u64("completeRecoveries : ",
+			   stats->complete_recoveries,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times the VDO has recovered from read-only mode */
+	result = write_u64("readOnlyRecoveries : ",
+			   stats->read_only_recoveries,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* String describing the operating mode of the VDO */
+	result = write_string("mode : ",
+			      stats->mode,
+			      ", ",
+			      buf,
+			      maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Whether the VDO is in recovery mode */
+	result = write_bool("inRecoveryMode : ",
+			    stats->in_recovery_mode,
+			    ", ",
+			    buf,
+			    maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* What percentage of recovery mode work has been completed */
+	result = write_u8("recoveryPercentage : ",
+			  stats->recovery_percentage,
+			  ", ",
+			  buf,
+			  maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The statistics for the compressed block packer */
+	result = write_packer_statistics("packer : ",
+					 &stats->packer,
+					 ", ",
+					 buf,
+					 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Counters for events in the block allocator */
+	result = write_block_allocator_statistics("allocator : ",
+						  &stats->allocator,
+						  ", ",
+						  buf,
+						  maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Counters for events in the recovery journal */
+	result = write_recovery_journal_statistics("journal : ",
+						   &stats->journal,
+						   ", ",
+						   buf,
+						   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The statistics for the slab journals */
+	result = write_slab_journal_statistics("slabJournal : ",
+					       &stats->slab_journal,
+					       ", ",
+					       buf,
+					       maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The statistics for the slab summary */
+	result = write_slab_summary_statistics("slabSummary : ",
+					       &stats->slab_summary,
+					       ", ",
+					       buf,
+					       maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The statistics for the reference counts */
+	result = write_ref_counts_statistics("refCounts : ",
+					     &stats->ref_counts,
+					     ", ",
+					     buf,
+					     maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The statistics for the block map */
+	result = write_block_map_statistics("blockMap : ",
+					    &stats->block_map,
+					    ", ",
+					    buf,
+					    maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The dedupe statistics from hash locks */
+	result = write_hash_lock_statistics("hashLock : ",
+					    &stats->hash_lock,
+					    ", ",
+					    buf,
+					    maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Counts of error conditions */
+	result = write_error_statistics("errors : ",
+					&stats->errors,
+					", ",
+					buf,
+					maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The VDO instance */
+	result = write_u32("instance : ",
+			   stats->instance,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Current number of active VIOs */
+	result = write_u32("currentVIOsInProgress : ",
+			   stats->current_vios_in_progress,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Maximum number of active VIOs */
+	result = write_u32("maxVIOs : ",
+			   stats->max_vios,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of times the UDS index was too slow in responding */
+	result = write_u64("dedupeAdviceTimeouts : ",
+			   stats->dedupe_advice_timeouts,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Number of flush requests submitted to the storage device */
+	result = write_u64("flushOut : ",
+			   stats->flush_out,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Logical block size */
+	result = write_u64("logicalBlockSize : ",
+			   stats->logical_block_size,
+			   ", ",
+			   buf,
+			   maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Bios submitted into VDO from above */
+	result = write_bio_stats("biosIn : ",
+				 &stats->bios_in,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosInPartial : ",
+				 &stats->bios_in_partial,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Bios submitted onward for user data */
+	result = write_bio_stats("biosOut : ",
+				 &stats->bios_out,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Bios submitted onward for metadata */
+	result = write_bio_stats("biosMeta : ",
+				 &stats->bios_meta,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosJournal : ",
+				 &stats->bios_journal,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosPageCache : ",
+				 &stats->bios_page_cache,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosOutCompleted : ",
+				 &stats->bios_out_completed,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosMetaCompleted : ",
+				 &stats->bios_meta_completed,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosJournalCompleted : ",
+				 &stats->bios_journal_completed,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosPageCacheCompleted : ",
+				 &stats->bios_page_cache_completed,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosAcknowledged : ",
+				 &stats->bios_acknowledged,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_bio_stats("biosAcknowledgedPartial : ",
+				 &stats->bios_acknowledged_partial,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Current number of bios in progress */
+	result = write_bio_stats("biosInProgress : ",
+				 &stats->bios_in_progress,
+				 ", ",
+				 buf,
+				 maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* Memory usage stats. */
+	result = write_memory_usage("memoryUsage : ",
+				    &stats->memory_usage,
+				    ", ",
+				    buf,
+				    maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	/* The statistics for the UDS index */
+	result = write_index_statistics("index : ",
+					&stats->index,
+					", ",
+					buf,
+					maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	result = write_string(NULL, "}", suffix, buf, maxlen);
+	if (result != VDO_SUCCESS)
+		return result;
+	return VDO_SUCCESS;
+}
+
+int vdo_write_stats(struct vdo *vdo,
+		    char *buf,
+		    unsigned int maxlen)
+{
+	struct vdo_statistics *stats;
+	int result;
+
+	result = UDS_ALLOCATE(1, struct vdo_statistics, __func__, &stats);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	vdo_fetch_statistics(vdo, stats);
+	result = write_vdo_statistics(NULL, stats, NULL, &buf, &maxlen);
+	UDS_FREE(stats);
+	return result;
+}
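A hedged sketch of driving vdo_write_stats() from a caller that owns its own buffer; the helper name, the 4096-byte size, and logging the result are illustrative assumptions, not how the target actually surfaces this message:

/* Illustrative only: render the statistics for a vdo and log the result. */
static int example_log_stats(struct vdo *vdo)
{
	char *buf;
	int result = UDS_ALLOCATE(4096, char, __func__, &buf);

	if (result != VDO_SUCCESS)
		return result;

	result = vdo_write_stats(vdo, buf, 4096);
	if (result == VDO_SUCCESS)
		uds_log_info("%s", buf);

	UDS_FREE(buf);
	return result;
}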
diff --git a/drivers/md/dm-vdo/message-stats.h b/drivers/md/dm-vdo/message-stats.h
new file mode 100644
index 00000000000..fdcc819ce11
--- /dev/null
+++ b/drivers/md/dm-vdo/message-stats.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_MESSAGE_STATS_H
+#define VDO_MESSAGE_STATS_H
+
+#include "types.h"
+
+int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen);
+
+#endif /* VDO_MESSAGE_STATS_H */
diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c
new file mode 100644
index 00000000000..f9b67777850
--- /dev/null
+++ b/drivers/md/dm-vdo/packer.c
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "packer.h"
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "physical-zone.h"
+#include "status-codes.h"
+#include "vdo.h"
+#include "vio.h"
+
+static const struct version_number COMPRESSED_BLOCK_1_0 = {
+	.major_version = 1,
+	.minor_version = 0,
+};
+
+enum {
+	COMPRESSED_BLOCK_1_0_SIZE = 4 + 4 + (2 * VDO_MAX_COMPRESSION_SLOTS),
+};
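For orientation, the constant above spells out the on-disk header layout: two little-endian 32-bit version fields plus one 16-bit size per compression slot. A worked example, assuming VDO_MAX_COMPRESSION_SLOTS is 14 as defined elsewhere in this series:

/*
 * Assuming VDO_MAX_COMPRESSION_SLOTS == 14:
 *   COMPRESSED_BLOCK_1_0_SIZE = 4 + 4 + (2 * 14) = 36 bytes,
 * leaving VDO_COMPRESSED_BLOCK_DATA_SIZE = VDO_BLOCK_SIZE - 36 = 4060 bytes
 * of fragment data in each 4096-byte compressed block (see packer.h).
 */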
+
+/**
+ * vdo_get_compressed_block_fragment() - Get a reference to a compressed fragment from a compressed
+ *                                       block.
+ * @mapping_state: [in] The mapping state for the lookup.
+ * @block: [in] The compressed block that was read from disk.
+ * @fragment_offset: [out] The offset of the fragment within the compressed block.
+ * @fragment_size: [out] The size of the fragment.
+ *
+ * Return: VDO_SUCCESS if a valid compressed fragment is found; VDO_INVALID_FRAGMENT if the
+ *         fragment is invalid.
+ */
+int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state,
+				      struct compressed_block *block,
+				      u16 *fragment_offset,
+				      u16 *fragment_size)
+{
+	u16 compressed_size;
+	u16 offset = 0;
+	unsigned int i;
+	u8 slot;
+	struct version_number version;
+
+	if (!vdo_is_state_compressed(mapping_state))
+		return VDO_INVALID_FRAGMENT;
+
+	version = vdo_unpack_version_number(block->header.version);
+	if (!vdo_are_same_version(version, COMPRESSED_BLOCK_1_0))
+		return VDO_INVALID_FRAGMENT;
+
+	slot = mapping_state - VDO_MAPPING_STATE_COMPRESSED_BASE;
+	if (slot >= VDO_MAX_COMPRESSION_SLOTS)
+		return VDO_INVALID_FRAGMENT;
+
+	compressed_size = __le16_to_cpu(block->header.sizes[slot]);
+	for (i = 0; i < slot; i++) {
+		offset += __le16_to_cpu(block->header.sizes[i]);
+		if (offset >= VDO_COMPRESSED_BLOCK_DATA_SIZE)
+			return VDO_INVALID_FRAGMENT;
+	}
+
+	if ((offset + compressed_size) > VDO_COMPRESSED_BLOCK_DATA_SIZE)
+		return VDO_INVALID_FRAGMENT;
+
+	*fragment_offset = offset;
+	*fragment_size = compressed_size;
+	return VDO_SUCCESS;
+}
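A minimal sketch of how a read path might consume this lookup; the helper name and destination buffer are hypothetical, and the sizes in the comment are made-up numbers:

/* Hypothetical caller: copy one fragment out of a compressed block. */
static int example_extract_fragment(enum block_mapping_state state,
				    struct compressed_block *block,
				    char *dest)
{
	u16 offset, size;
	int result = vdo_get_compressed_block_fragment(state, block, &offset, &size);

	if (result != VDO_SUCCESS)
		return result;

	/*
	 * Example arithmetic: with sizes[0] == 1000 and sizes[1] == 700, the
	 * fragment for slot 2 starts at offset 1700 and is sizes[2] bytes long.
	 */
	memcpy(dest, &block->data[offset], size);
	return VDO_SUCCESS;
}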
+
+/**
+ * assert_on_packer_thread() - Check that we are on the packer thread.
+ * @packer: The packer.
+ * @caller: The function which is asserting.
+ */
+static inline void assert_on_packer_thread(struct packer *packer, const char *caller)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == packer->thread_id),
+			"%s() called from packer thread", caller);
+}
+
+/**
+ * insert_in_sorted_list() - Insert a bin into the sorted bin list.
+ * @packer: The packer.
+ * @bin: The bin to move to its sorted position.
+ *
+ * The list is in ascending order of free space. Since all bins are already in the list, this
+ * actually moves the bin to the correct position in the list.
+ */
+static void insert_in_sorted_list(struct packer *packer, struct packer_bin *bin)
+{
+	struct packer_bin *active_bin;
+
+	list_for_each_entry(active_bin, &packer->bins, list)
+		if (active_bin->free_space > bin->free_space) {
+			list_move_tail(&bin->list, &active_bin->list);
+			return;
+		}
+
+	list_move_tail(&bin->list, &packer->bins);
+}
+
+/**
+ * make_bin() - Allocate a bin and put it into the packer's list.
+ * @packer: The packer.
+ */
+static int __must_check make_bin(struct packer *packer)
+{
+	struct packer_bin *bin;
+	int result;
+
+	result = UDS_ALLOCATE_EXTENDED(struct packer_bin,
+				       VDO_MAX_COMPRESSION_SLOTS,
+				       struct vio *,
+				       __func__,
+				       &bin);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	bin->free_space = VDO_COMPRESSED_BLOCK_DATA_SIZE;
+	INIT_LIST_HEAD(&bin->list);
+	list_add_tail(&bin->list, &packer->bins);
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_make_packer() - Make a new block packer.
+ *
+ * @vdo: The vdo to which this packer belongs.
+ * @bin_count: The number of partial bins to keep in memory.
+ * @packer_ptr: A pointer to hold the new packer.
+ *
+ * Return: VDO_SUCCESS or an error
+ */
+int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **packer_ptr)
+{
+	struct packer *packer;
+	block_count_t i;
+	int result;
+
+	result = UDS_ALLOCATE(1, struct packer, __func__, &packer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	packer->thread_id = vdo->thread_config.packer_thread;
+	packer->size = bin_count;
+	INIT_LIST_HEAD(&packer->bins);
+	vdo_set_admin_state_code(&packer->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+
+	for (i = 0; i < bin_count; i++) {
+		result = make_bin(packer);
+		if (result != VDO_SUCCESS) {
+			vdo_free_packer(packer);
+			return result;
+		}
+	}
+
+	/*
+	 * The canceled bin can hold up to half the number of user vios. Every canceled vio in the
+	 * bin must have a canceler for which it is waiting, and any canceler will only have
+	 * canceled one lock holder at a time.
+	 */
+	result = UDS_ALLOCATE_EXTENDED(struct packer_bin,
+				       MAXIMUM_VDO_USER_VIOS / 2,
+				       struct vio *, __func__,
+				       &packer->canceled_bin);
+	if (result != VDO_SUCCESS) {
+		vdo_free_packer(packer);
+		return result;
+	}
+
+	result = vdo_make_default_thread(vdo, packer->thread_id);
+	if (result != VDO_SUCCESS) {
+		vdo_free_packer(packer);
+		return result;
+	}
+
+	*packer_ptr = packer;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_packer() - Free a block packer.
+ * @packer: The packer to free.
+ */
+void vdo_free_packer(struct packer *packer)
+{
+	struct packer_bin *bin, *tmp;
+
+	if (packer == NULL)
+		return;
+
+	list_for_each_entry_safe(bin, tmp, &packer->bins, list) {
+		list_del_init(&bin->list);
+		UDS_FREE(bin);
+	}
+
+	UDS_FREE(UDS_FORGET(packer->canceled_bin));
+	UDS_FREE(packer);
+}
+
+/**
+ * get_packer_from_data_vio() - Get the packer from a data_vio.
+ * @data_vio: The data_vio.
+ *
+ * Return: The packer from the VDO to which the data_vio belongs.
+ */
+static inline struct packer *get_packer_from_data_vio(struct data_vio *data_vio)
+{
+	return vdo_from_data_vio(data_vio)->packer;
+}
+
+/**
+ * vdo_get_packer_statistics() - Get the current statistics from the packer.
+ * @packer: The packer to query.
+ *
+ * Return: a copy of the current statistics for the packer.
+ */
+struct packer_statistics vdo_get_packer_statistics(const struct packer *packer)
+{
+	const struct packer_statistics *stats = &packer->statistics;
+
+	return (struct packer_statistics) {
+		.compressed_fragments_written = READ_ONCE(stats->compressed_fragments_written),
+		.compressed_blocks_written = READ_ONCE(stats->compressed_blocks_written),
+		.compressed_fragments_in_packer = READ_ONCE(stats->compressed_fragments_in_packer),
+	};
+}
+
+/**
+ * abort_packing() - Abort packing a data_vio.
+ * @data_vio: The data_vio to abort.
+ */
+static void abort_packing(struct data_vio *data_vio)
+{
+	struct packer *packer = get_packer_from_data_vio(data_vio);
+
+	WRITE_ONCE(packer->statistics.compressed_fragments_in_packer,
+		   packer->statistics.compressed_fragments_in_packer - 1);
+
+	write_data_vio(data_vio);
+}
+
+/**
+ * release_compressed_write_waiter() - Update a data_vio for which a successful compressed write
+ *                                     has completed and send it on its way.
+ *
+ * @data_vio: The data_vio to release.
+ * @allocation: The allocation to which the compressed block was written.
+ */
+static void
+release_compressed_write_waiter(struct data_vio *data_vio, struct allocation *allocation)
+{
+	data_vio->new_mapped = (struct zoned_pbn) {
+		.pbn = allocation->pbn,
+		.zone = allocation->zone,
+		.state = data_vio->compression.slot + VDO_MAPPING_STATE_COMPRESSED_BASE,
+	};
+
+	vdo_share_compressed_write_lock(data_vio, allocation->lock);
+	update_metadata_for_data_vio_write(data_vio, allocation->lock);
+}
+
+/**
+ * finish_compressed_write() - Finish a compressed block write.
+ * @completion: The compressed write completion.
+ *
+ * This callback is registered in continue_after_allocation().
+ */
+static void finish_compressed_write(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct data_vio *client, *next;
+
+	assert_data_vio_in_allocated_zone(agent);
+
+	/*
+	 * Process all the non-agent waiters first to ensure that the pbn lock cannot be released
+	 * until all of them have had a chance to journal their increfs.
+	 */
+	for (client = agent->compression.next_in_batch; client != NULL; client = next) {
+		next = client->compression.next_in_batch;
+		release_compressed_write_waiter(client, &agent->allocation);
+	}
+
+	completion->error_handler = handle_data_vio_error;
+	release_compressed_write_waiter(agent, &agent->allocation);
+}
+
+static void handle_compressed_write_error(struct vdo_completion *completion)
+{
+	struct data_vio *agent = as_data_vio(completion);
+	struct allocation *allocation = &agent->allocation;
+	struct data_vio *client, *next;
+
+	if (vdo_requeue_completion_if_needed(completion, allocation->zone->thread_id))
+		return;
+
+	update_vio_error_stats(as_vio(completion),
+			       "Completing compressed write vio for physical block %llu with error",
+			       (unsigned long long) allocation->pbn);
+
+	for (client = agent->compression.next_in_batch; client != NULL; client = next) {
+		next = client->compression.next_in_batch;
+		write_data_vio(client);
+	}
+
+	/* Now that we've released the batch from the packer, forget the error and continue on. */
+	vdo_reset_completion(completion);
+	completion->error_handler = handle_data_vio_error;
+	write_data_vio(agent);
+}
+
+/**
+ * add_to_bin() - Put a data_vio in a specific packer_bin in which it will definitely fit.
+ * @bin: The bin in which to put the data_vio.
+ * @data_vio: The data_vio to add.
+ */
+static void add_to_bin(struct packer_bin *bin, struct data_vio *data_vio)
+{
+	data_vio->compression.bin = bin;
+	data_vio->compression.slot = bin->slots_used;
+	bin->incoming[bin->slots_used++] = data_vio;
+}
+
+/**
+ * remove_from_bin() - Get the next data_vio whose compression has not been canceled from a bin.
+ * @packer: The packer.
+ * @bin: The bin from which to get a data_vio.
+ *
+ * Any canceled data_vios will be moved to the canceled bin.
+ *
+ * Return: An uncanceled data_vio from the bin, or NULL if there are none.
+ */
+static struct data_vio *remove_from_bin(struct packer *packer, struct packer_bin *bin)
+{
+	while (bin->slots_used > 0) {
+		struct data_vio *data_vio = bin->incoming[--bin->slots_used];
+
+		if (!advance_data_vio_compression_stage(data_vio).may_not_compress) {
+			data_vio->compression.bin = NULL;
+			return data_vio;
+		}
+
+		add_to_bin(packer->canceled_bin, data_vio);
+	}
+
+	/* The bin is now empty. */
+	bin->free_space = VDO_COMPRESSED_BLOCK_DATA_SIZE;
+	return NULL;
+}
+
+/**
+ * initialize_compressed_block() - Initialize a compressed block.
+ * @block: The compressed block to initialize.
+ * @size: The size of the agent's fragment.
+ *
+ * This method initializes the compressed block in the compressed write agent. Because the
+ * compressor already put the agent's compressed fragment at the start of the compressed block's
+ * data field, it needn't be copied. So all we need do is initialize the header and set the size of
+ * the agent's fragment.
+ */
+static void initialize_compressed_block(struct compressed_block *block, u16 size)
+{
+	/*
+	 * Make sure the block layout isn't accidentally changed by changing the length of the
+	 * block header.
+	 */
+	STATIC_ASSERT_SIZEOF(struct compressed_block_header, COMPRESSED_BLOCK_1_0_SIZE);
+
+	block->header.version = vdo_pack_version_number(COMPRESSED_BLOCK_1_0);
+	block->header.sizes[0] = __cpu_to_le16(size);
+}
+
+/**
+ * pack_fragment() - Pack a data_vio's fragment into the compressed block in which it is already
+ *                   known to fit.
+ * @compression: The agent's compression_state to pack into.
+ * @data_vio: The data_vio to pack.
+ * @offset: The offset into the compressed block at which to pack the fragment.
+ * @slot: The slot in the compressed block which will hold this fragment.
+ * @block: The compressed block which will be written out when the batch is fully packed.
+ *
+ * Return: The new amount of space used.
+ */
+static block_size_t __must_check
+pack_fragment(struct compression_state *compression,
+	      struct data_vio *data_vio,
+	      block_size_t offset,
+	      slot_number_t slot,
+	      struct compressed_block *block)
+{
+	struct compression_state *to_pack = &data_vio->compression;
+	char *fragment = to_pack->block->data;
+
+	to_pack->next_in_batch = compression->next_in_batch;
+	compression->next_in_batch = data_vio;
+	to_pack->slot = slot;
+	block->header.sizes[slot] = __cpu_to_le16(to_pack->size);
+	memcpy(&block->data[offset], fragment, to_pack->size);
+	return (offset + to_pack->size);
+}
+
+/**
+ * compressed_write_end_io() - The bio_end_io for a compressed block write.
+ * @bio: The bio for the compressed write.
+ */
+static void compressed_write_end_io(struct bio *bio)
+{
+	struct data_vio *data_vio = vio_as_data_vio(bio->bi_private);
+
+	vdo_count_completed_bios(bio);
+	set_data_vio_allocated_zone_callback(data_vio, finish_compressed_write);
+	continue_data_vio_with_error(data_vio, blk_status_to_errno(bio->bi_status));
+}
+
+/**
+ * write_bin() - Write out a bin.
+ * @packer: The packer.
+ * @bin: The bin to write.
+ */
+static void write_bin(struct packer *packer, struct packer_bin *bin)
+{
+	int result;
+	block_size_t offset;
+	slot_number_t slot = 1;
+	struct compression_state *compression;
+	struct compressed_block *block;
+	struct data_vio *agent = remove_from_bin(packer, bin);
+	struct data_vio *client;
+	struct packer_statistics *stats;
+
+	if (agent == NULL)
+		return;
+
+	compression = &agent->compression;
+	compression->slot = 0;
+	block = compression->block;
+	initialize_compressed_block(block, compression->size);
+	offset = compression->size;
+
+	while ((client = remove_from_bin(packer, bin)) != NULL)
+		offset = pack_fragment(compression, client, offset, slot++, block);
+
+	/*
+	 * If the batch contains only a single vio, then we gain nothing by writing out the
+	 * compressed form. Continue processing the single vio in the batch.
+	 */
+	if (slot == 1) {
+		abort_packing(agent);
+		return;
+	}
+
+	if (slot < VDO_MAX_COMPRESSION_SLOTS)
+		/* Clear out the sizes of the unused slots */
+		memset(&block->header.sizes[slot],
+		       0,
+		       (VDO_MAX_COMPRESSION_SLOTS - slot) * sizeof(__le16));
+
+	agent->vio.completion.error_handler = handle_compressed_write_error;
+	if (vdo_is_read_only(vdo_from_data_vio(agent))) {
+		continue_data_vio_with_error(agent, VDO_READ_ONLY);
+		return;
+	}
+
+	result = vio_reset_bio(&agent->vio,
+			       (char *) block,
+			       compressed_write_end_io,
+			       REQ_OP_WRITE,
+			       agent->allocation.pbn);
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(agent, result);
+		return;
+	}
+
+	/*
+	 * Once the compressed write is submitted, the fragments are no longer in the packer, so
+	 * update stats now.
+	 */
+	stats = &packer->statistics;
+	WRITE_ONCE(stats->compressed_fragments_in_packer,
+		   (stats->compressed_fragments_in_packer - slot));
+	WRITE_ONCE(stats->compressed_fragments_written,
+		   (stats->compressed_fragments_written + slot));
+	WRITE_ONCE(stats->compressed_blocks_written, stats->compressed_blocks_written + 1);
+
+	submit_data_vio_io(agent);
+}
+
+/**
+ * add_data_vio_to_packer_bin() - Add a data_vio to a bin's incoming queue.
+ * @packer: The packer.
+ * @bin: The bin to which to add the data_vio.
+ * @data_vio: The data_vio to add to the bin's queue.
+ *
+ * Adds a data_vio to a bin's incoming queue, writing out the bin first if it does not have room,
+ * and then restores the packer's sort order since the bin's free space has changed.
+ */
+static void add_data_vio_to_packer_bin(struct packer *packer,
+				       struct packer_bin *bin,
+				       struct data_vio *data_vio)
+{
+	/* If the selected bin doesn't have room, start a new batch to make room. */
+	if (bin->free_space < data_vio->compression.size)
+		write_bin(packer, bin);
+
+	add_to_bin(bin, data_vio);
+	bin->free_space -= data_vio->compression.size;
+
+	/* If we happen to exactly fill the bin, start a new batch. */
+	if ((bin->slots_used == VDO_MAX_COMPRESSION_SLOTS) ||
+	    (bin->free_space == 0))
+		write_bin(packer, bin);
+
+	/* Now that we've finished changing the free space, restore the sort order. */
+	insert_in_sorted_list(packer, bin);
+}
+
+/**
+ * select_bin() - Select the bin that should be used to pack the compressed data in a data_vio with
+ *                other data_vios.
+ * @packer: The packer.
+ * @data_vio: The data_vio.
+ */
+static struct packer_bin * __must_check
+select_bin(struct packer *packer, struct data_vio *data_vio)
+{
+	/*
+	 * First best fit: select the bin with the least free space that has enough room for the
+	 * compressed data in the data_vio.
+	 */
+	struct packer_bin *bin, *fullest_bin;
+
+	list_for_each_entry(bin, &packer->bins, list)
+		if (bin->free_space >= data_vio->compression.size)
+			return bin;
+
+	/*
+	 * None of the bins have enough space for the data_vio. We're not allowed to create new
+	 * bins, so we have to overflow one of the existing bins. It's pretty intuitive to select
+	 * the fullest bin, since that "wastes" the least amount of free space in the compressed
+	 * block. But if the space currently used in the fullest bin is smaller than the compressed
+	 * size of the incoming block, it seems wrong to force that bin to write when giving up on
+	 * compressing the incoming data_vio would likewise "waste" the least amount of free space.
+	 */
+	fullest_bin = list_first_entry(&packer->bins, struct packer_bin, list);
+	if (data_vio->compression.size >=
+	    (VDO_COMPRESSED_BLOCK_DATA_SIZE - fullest_bin->free_space))
+		return NULL;
+
+	/*
+	 * The fullest bin doesn't have room, but writing it out and starting a new batch with the
+	 * incoming data_vio will increase the packer's free space.
+	 */
+	return fullest_bin;
+}
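To make the trade-off above concrete, a small numeric sketch; all sizes are hypothetical and assume roughly 4060 bytes of data space per compressed block:

/*
 * Suppose the fullest bin has free_space == 1000, so 3060 bytes are already
 * packed into it.
 *   - An incoming fragment of 2000 bytes is smaller than the 3060 bytes in
 *     use, so writing out the fullest bin wastes less space than giving up;
 *     select_bin() returns the fullest bin.
 *   - An incoming fragment of 3500 bytes is not smaller, so select_bin()
 *     returns NULL and vdo_attempt_packing() aborts packing instead.
 */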
+
+/**
+ * vdo_attempt_packing() - Attempt to rewrite the data in this data_vio as part of a compressed
+ *                         block.
+ * @data_vio: The data_vio to pack.
+ */
+void vdo_attempt_packing(struct data_vio *data_vio)
+{
+	int result;
+	struct packer_bin *bin;
+	struct data_vio_compression_status status = get_data_vio_compression_status(data_vio);
+	struct packer *packer = get_packer_from_data_vio(data_vio);
+
+	assert_on_packer_thread(packer, __func__);
+
+	result = ASSERT((status.stage == DATA_VIO_COMPRESSING),
+			"attempt to pack data_vio not ready for packing, stage: %u",
+			status.stage);
+	if (result != VDO_SUCCESS)
+		return;
+
+	/*
+	 * Increment the counter whether or not this data_vio will be packed, since abort_packing()
+	 * always decrements the counter.
+	 */
+	WRITE_ONCE(packer->statistics.compressed_fragments_in_packer,
+		   packer->statistics.compressed_fragments_in_packer + 1);
+
+	/*
+	 * If packing of this data_vio is disallowed for administrative reasons, give up before
+	 * making any state changes.
+	 */
+	if (!vdo_is_state_normal(&packer->state) ||
+	    (data_vio->flush_generation < packer->flush_generation)) {
+		abort_packing(data_vio);
+		return;
+	}
+
+	/*
+	 * The advance_data_vio_compression_stage() check here will set the data_vio's compression
+	 * stage to DATA_VIO_PACKING if the data_vio is allowed to be compressed (if it has already
+	 * been canceled, we'll fall out here). Once the data_vio is in the DATA_VIO_PACKING stage,
+	 * it must be guaranteed to be put in a bin before any more requests can be processed by
+	 * the packer thread. Otherwise, a canceling data_vio could attempt to remove the canceled
+	 * data_vio from the packer and fail to rendezvous with it (VDO-2809). We must also make
+	 * sure that we will actually bin the data_vio and not give up on it as being larger than
+	 * the space used in the fullest bin. Hence we must call select_bin() before calling
+	 * advance_data_vio_compression_stage() (VDO-2826).
+	 */
+	bin = select_bin(packer, data_vio);
+	if ((bin == NULL) ||
+	    (advance_data_vio_compression_stage(data_vio).stage != DATA_VIO_PACKING)) {
+		abort_packing(data_vio);
+		return;
+	}
+
+	add_data_vio_to_packer_bin(packer, bin, data_vio);
+}
+
+/**
+ * check_for_drain_complete() - Check whether the packer has drained.
+ * @packer: The packer.
+ */
+static void check_for_drain_complete(struct packer *packer)
+{
+	if (vdo_is_state_draining(&packer->state) && (packer->canceled_bin->slots_used == 0))
+		vdo_finish_draining(&packer->state);
+}
+
+/**
+ * write_all_non_empty_bins() - Write out all non-empty bins on behalf of a flush or suspend.
+ * @packer: The packer being flushed.
+ */
+static void write_all_non_empty_bins(struct packer *packer)
+{
+	struct packer_bin *bin;
+
+	list_for_each_entry(bin, &packer->bins, list)
+		write_bin(packer, bin);
+		/*
+		 * We don't need to re-sort the bin here since this loop will make every bin have
+		 * the same amount of free space, so every ordering is sorted.
+		 */
+
+	check_for_drain_complete(packer);
+}
+
+/**
+ * vdo_flush_packer() - Request that the packer flush asynchronously.
+ * @packer: The packer to flush.
+ *
+ * All bins with at least two compressed data blocks will be written out, and any solitary pending
+ * VIOs will be released from the packer. While flushing is in progress, any VIOs submitted to
+ * vdo_attempt_packing() will be continued immediately without attempting to pack them.
+ */
+void vdo_flush_packer(struct packer *packer)
+{
+	assert_on_packer_thread(packer, __func__);
+	if (vdo_is_state_normal(&packer->state))
+		write_all_non_empty_bins(packer);
+}
+
+/**
+ * vdo_remove_lock_holder_from_packer() - Remove a lock holder from the packer.
+ * @completion: The data_vio which needs a lock held by a data_vio in the packer. The data_vio's
+ *              compression.lock_holder field will point to the data_vio to remove.
+ */
+void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion)
+{
+	struct data_vio *data_vio = as_data_vio(completion);
+	struct packer *packer = get_packer_from_data_vio(data_vio);
+	struct data_vio *lock_holder;
+	struct packer_bin *bin;
+	slot_number_t slot;
+
+	assert_data_vio_in_packer_zone(data_vio);
+
+	lock_holder = UDS_FORGET(data_vio->compression.lock_holder);
+	bin = lock_holder->compression.bin;
+	ASSERT_LOG_ONLY((bin != NULL), "data_vio in packer has a bin");
+
+	slot = lock_holder->compression.slot;
+	bin->slots_used--;
+	if (slot < bin->slots_used) {
+		bin->incoming[slot] = bin->incoming[bin->slots_used];
+		bin->incoming[slot]->compression.slot = slot;
+	}
+
+	lock_holder->compression.bin = NULL;
+	lock_holder->compression.slot = 0;
+
+	if (bin != packer->canceled_bin) {
+		bin->free_space += lock_holder->compression.size;
+		insert_in_sorted_list(packer, bin);
+	}
+
+	abort_packing(lock_holder);
+	check_for_drain_complete(packer);
+}
+
+/**
+ * vdo_increment_packer_flush_generation() - Increment the flush generation in the packer.
+ * @packer: The packer.
+ *
+ * This will also cause the packer to flush so that any VIOs from previous generations will exit
+ * the packer.
+ */
+void vdo_increment_packer_flush_generation(struct packer *packer)
+{
+	assert_on_packer_thread(packer, __func__);
+	packer->flush_generation++;
+	vdo_flush_packer(packer);
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ *
+ * Implements vdo_admin_initiator.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	struct packer *packer = container_of(state, struct packer, state);
+
+	write_all_non_empty_bins(packer);
+}
+
+/**
+ * vdo_drain_packer() - Drain the packer by preventing any more VIOs from entering the packer and
+ *                      then flushing.
+ * @packer: The packer to drain.
+ * @completion: The completion to finish when the packer has drained.
+ */
+void vdo_drain_packer(struct packer *packer, struct vdo_completion *completion)
+{
+	assert_on_packer_thread(packer, __func__);
+	vdo_start_draining(&packer->state, VDO_ADMIN_STATE_SUSPENDING, completion, initiate_drain);
+}
+
+/**
+ * vdo_resume_packer() - Resume a packer which has been suspended.
+ * @packer: The packer to resume.
+ * @parent: The completion to finish when the packer has resumed.
+ */
+void vdo_resume_packer(struct packer *packer, struct vdo_completion *parent)
+{
+	assert_on_packer_thread(packer, __func__);
+	vdo_continue_completion(parent, vdo_resume_if_quiescent(&packer->state));
+}
+
+static void dump_packer_bin(const struct packer_bin *bin, bool canceled)
+{
+	if (bin->slots_used == 0)
+		/* Don't dump empty bins. */
+		return;
+
+	uds_log_info("	  %sBin slots_used=%u free_space=%zu",
+		     (canceled ? "Canceled" : ""), bin->slots_used,
+		     bin->free_space);
+
+	/*
+	 * FIXME: dump vios in bin->incoming? The vios should have been dumped from the vio pool.
+	 * Maybe just dump their addresses so it's clear they're here?
+	 */
+}
+
+/**
+ * vdo_dump_packer() - Dump the packer.
+ * @packer: The packer.
+ *
+ * Context: dumps in a thread-unsafe fashion.
+ */
+void vdo_dump_packer(const struct packer *packer)
+{
+	struct packer_bin *bin;
+
+	uds_log_info("packer");
+	uds_log_info("	flushGeneration=%llu state %s  packer_bin_count=%llu",
+		     (unsigned long long) packer->flush_generation,
+		     vdo_get_admin_state_code(&packer->state)->name,
+		     (unsigned long long) packer->size);
+
+	list_for_each_entry(bin, &packer->bins, list)
+		dump_packer_bin(bin, false);
+
+	dump_packer_bin(packer->canceled_bin, true);
+}
diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h
new file mode 100644
index 00000000000..8eed435cf9f
--- /dev/null
+++ b/drivers/md/dm-vdo/packer.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_PACKER_H
+#define VDO_PACKER_H
+
+#include <linux/list.h>
+
+#include "admin-state.h"
+#include "constants.h"
+#include "encodings.h"
+#include "statistics.h"
+#include "types.h"
+#include "wait-queue.h"
+
+enum {
+	DEFAULT_PACKER_BINS = 16,
+};
+
+/* The header of a compressed block. */
+struct compressed_block_header {
+	/* Unsigned 32-bit major and minor versions, little-endian */
+	struct packed_version_number version;
+
+	/* List of unsigned 16-bit compressed block sizes, little-endian */
+	__le16 sizes[VDO_MAX_COMPRESSION_SLOTS];
+} __packed;
+
+enum {
+	VDO_COMPRESSED_BLOCK_DATA_SIZE = VDO_BLOCK_SIZE - sizeof(struct compressed_block_header),
+
+	/*
+	 * A compressed block is only written if we can pack at least two fragments into it, so a
+	 * fragment which fills the entire data portion of a compressed block is too big.
+	 */
+	VDO_MAX_COMPRESSED_FRAGMENT_SIZE = VDO_COMPRESSED_BLOCK_DATA_SIZE - 1,
+};
+
+/* The compressed block overlay. */
+struct compressed_block {
+	struct compressed_block_header header;
+	char data[VDO_COMPRESSED_BLOCK_DATA_SIZE];
+} __packed;
+
+/*
+ * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed
+ * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with
+ * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or
+ * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin.
+ * Upon entering the packer, each data_vio already has its compressed data in the first slot of the
+ * data_vio's compressed_block (overlaid on the data_vio's scratch_block). So the agent's fragment
+ * is already in place. The fragments for the other uncanceled data_vios in the bin are packed into
+ * the agent's compressed block. The agent then writes out the compressed block. If the write is
+ * successful, the agent shares its pbn lock with each of the other data_vios in its compressed
+ * block and sends each on its way. Finally, the agent itself continues on the write path as before.
+ *
+ * There is one special bin which is used to hold data_vios which have been canceled and removed
+ * from their bin by the packer. These data_vios need to wait for the canceller to rendezvous with
+ * them (VDO-2809) and so they sit in this special bin.
+ */
+struct packer_bin {
+	/* List links for packer.packer_bins */
+	struct list_head list;
+	/* The number of items in the bin */
+	slot_number_t slots_used;
+	/* The number of compressed block bytes remaining in the current batch */
+	size_t free_space;
+	/* The current partial batch of data_vios, waiting for more */
+	struct data_vio *incoming[];
+};
+
+struct packer {
+	/* The ID of the packer's callback thread */
+	thread_id_t thread_id;
+	/* The number of bins */
+	block_count_t size;
+	/* A list of all packer_bins, kept sorted by free_space */
+	struct list_head bins;
+	/*
+	 * A bin to hold data_vios which were canceled out of the packer and are waiting to
+	 * rendezvous with the canceling data_vio.
+	 */
+	struct packer_bin *canceled_bin;
+
+	/* The current flush generation */
+	sequence_number_t flush_generation;
+
+	/* The administrative state of the packer */
+	struct admin_state state;
+
+	/* Statistics are only updated on the packer thread, but are accessed from other threads */
+	struct packer_statistics statistics;
+};
+
+int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state,
+				      struct compressed_block *block,
+				      u16 *fragment_offset,
+				      u16 *fragment_size);
+
+int __must_check
+vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **packer_ptr);
+
+void vdo_free_packer(struct packer *packer);
+
+struct packer_statistics __must_check vdo_get_packer_statistics(const struct packer *packer);
+
+void vdo_attempt_packing(struct data_vio *data_vio);
+
+void vdo_flush_packer(struct packer *packer);
+
+void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion);
+
+void vdo_increment_packer_flush_generation(struct packer *packer);
+
+void vdo_drain_packer(struct packer *packer, struct vdo_completion *completion);
+
+void vdo_resume_packer(struct packer *packer, struct vdo_completion *parent);
+
+void vdo_dump_packer(const struct packer *packer);
+
+#endif /* VDO_PACKER_H */
diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c
new file mode 100644
index 00000000000..f07718a662d
--- /dev/null
+++ b/drivers/md/dm-vdo/physical-zone.c
@@ -0,0 +1,650 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "physical-zone.h"
+
+#include <linux/list.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "encodings.h"
+#include "flush.h"
+#include "int-map.h"
+#include "slab-depot.h"
+#include "status-codes.h"
+#include "vdo.h"
+
+enum {
+	/* Each user data_vio needs a PBN read lock and write lock. */
+	LOCK_POOL_CAPACITY = 2 * MAXIMUM_VDO_USER_VIOS,
+};
+
+struct pbn_lock_implementation {
+	enum pbn_lock_type type;
+	const char *name;
+	const char *release_reason;
+};
+
+/* This array must have an entry for every pbn_lock_type value. */
+static const struct pbn_lock_implementation LOCK_IMPLEMENTATIONS[] = {
+	[VIO_READ_LOCK] = {
+		.type = VIO_READ_LOCK,
+		.name = "read",
+		.release_reason = "candidate duplicate",
+	},
+	[VIO_WRITE_LOCK] = {
+		.type = VIO_WRITE_LOCK,
+		.name = "write",
+		.release_reason = "newly allocated",
+	},
+	[VIO_BLOCK_MAP_WRITE_LOCK] = {
+		.type = VIO_BLOCK_MAP_WRITE_LOCK,
+		.name = "block map write",
+		.release_reason = "block map write",
+	},
+};
+
+static inline bool has_lock_type(const struct pbn_lock *lock, enum pbn_lock_type type)
+{
+	return (lock->implementation == &LOCK_IMPLEMENTATIONS[type]);
+}
+
+/**
+ * vdo_is_pbn_read_lock() - Check whether a pbn_lock is a read lock.
+ * @lock: The lock to check.
+ *
+ * Return: true if the lock is a read lock.
+ */
+bool vdo_is_pbn_read_lock(const struct pbn_lock *lock)
+{
+	return has_lock_type(lock, VIO_READ_LOCK);
+}
+
+static inline void set_pbn_lock_type(struct pbn_lock *lock, enum pbn_lock_type type)
+{
+	lock->implementation = &LOCK_IMPLEMENTATIONS[type];
+}
+
+/**
+ * vdo_downgrade_pbn_write_lock() - Downgrade a PBN write lock to a PBN read lock.
+ * @lock: The PBN write lock to downgrade.
+ *
+ * The lock holder count is cleared and the caller is responsible for setting the new count.
+ */
+void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write)
+{
+	ASSERT_LOG_ONLY(!vdo_is_pbn_read_lock(lock),
+			"PBN lock must not already have been downgraded");
+	ASSERT_LOG_ONLY(!has_lock_type(lock, VIO_BLOCK_MAP_WRITE_LOCK),
+			"must not downgrade block map write locks");
+	ASSERT_LOG_ONLY(lock->holder_count == 1,
+			"PBN write lock should have one holder but has %u",
+			lock->holder_count);
+	/*
+	 * data_vio write locks are downgraded in place--the writer retains the hold on the lock.
+	 * If this was a compressed write, the holder has not yet journaled its own reference
+	 * increment; otherwise, it has.
+	 */
+	lock->increment_limit =
+		(compressed_write ? MAXIMUM_REFERENCE_COUNT : MAXIMUM_REFERENCE_COUNT - 1);
+	set_pbn_lock_type(lock, VIO_READ_LOCK);
+}
+
+/**
+ * vdo_claim_pbn_lock_increment() - Try to claim one of the available reference count increments on
+ *				    a read lock.
+ * @lock: The PBN read lock from which to claim an increment.
+ *
+ * Claims may be attempted from any thread. A claim is only valid until the PBN lock is released.
+ *
+ * Return: true if the claim succeeded, guaranteeing one increment can be made without overflowing
+ *	   the PBN's reference count.
+ */
+bool vdo_claim_pbn_lock_increment(struct pbn_lock *lock)
+{
+	/*
+	 * Claim the next free reference atomically since hash locks from multiple hash zone
+	 * threads might be concurrently deduplicating against a single PBN lock on compressed
+	 * threads might be concurrently deduplicating against a single PBN lock on a compressed
+	 * in a sane time-frame, we won't overflow a 32-bit claim counter, allowing a simple add
+	 * instead of a compare-and-swap.
+	 */
+	u32 claim_number = (u32) atomic_add_return(1, &lock->increments_claimed);
+
+	return (claim_number <= lock->increment_limit);
+}
+
+/**
+ * vdo_assign_pbn_lock_provisional_reference() - Inform a PBN lock that it is responsible for a
+ *						 provisional reference.
+ * @lock: The PBN lock.
+ */
+void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock)
+{
+	ASSERT_LOG_ONLY(!lock->has_provisional_reference,
+			"lock does not have a provisional reference");
+	lock->has_provisional_reference = true;
+}
+
+/**
+ * vdo_unassign_pbn_lock_provisional_reference() - Inform a PBN lock that it is no longer
+ *						   responsible for a provisional reference.
+ * @lock: The PBN lock.
+ */
+void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock)
+{
+	lock->has_provisional_reference = false;
+}
+
+/**
+ * release_pbn_lock_provisional_reference() - If the lock is responsible for a provisional
+ *					      reference, release that reference.
+ * @lock: The lock.
+ * @locked_pbn: The PBN covered by the lock.
+ * @allocator: The block allocator from which to release the reference.
+ *
+ * This method is called when the lock is released.
+ */
+static void
+release_pbn_lock_provisional_reference(struct pbn_lock *lock,
+				       physical_block_number_t locked_pbn,
+				       struct block_allocator *allocator)
+{
+	int result;
+
+	if (!vdo_pbn_lock_has_provisional_reference(lock))
+		return;
+
+	result = vdo_release_block_reference(allocator, locked_pbn);
+	if (result != VDO_SUCCESS)
+		uds_log_error_strerror(result,
+				       "Failed to release reference to %s physical block %llu",
+				       lock->implementation->release_reason,
+				       (unsigned long long) locked_pbn);
+
+	vdo_unassign_pbn_lock_provisional_reference(lock);
+}
+
+/**
+ * union idle_pbn_lock - PBN lock list entries.
+ *
+ * Unused (idle) PBN locks are kept in a list. Just like in a malloc implementation, the lock
+ * structure is unused memory, so we can save a bit of space (and not pollute the lock structure
+ * proper) by using a union to overlay the lock structure with the free list.
+ */
+typedef union {
+	/** @entry: Only used while locks are in the pool. */
+	struct list_head entry;
+	/** @lock: Only used while locks are not in the pool. */
+	struct pbn_lock lock;
+} idle_pbn_lock;
+
+/**
+ * struct pbn_lock_pool - list of PBN locks.
+ *
+ * The lock pool is little more than the memory allocated for the locks.
+ */
+struct pbn_lock_pool {
+	/** @capacity: The number of locks allocated for the pool. */
+	size_t capacity;
+	/** @borrowed: The number of locks currently borrowed from the pool. */
+	size_t borrowed;
+	/** @idle_list: A list containing all idle PBN lock instances. */
+	struct list_head idle_list;
+	/** @locks: The memory for all the locks allocated by this pool. */
+	idle_pbn_lock locks[];
+};
+
+/**
+ * return_pbn_lock_to_pool() - Return a pbn lock to its pool.
+ * @pool: The pool from which the lock was borrowed.
+ * @lock: The last reference to the lock being returned.
+ *
+ * It must be the last live reference, as if the memory were being freed (the lock memory will
+ * be re-initialized or zeroed).
+ */
+static void return_pbn_lock_to_pool(struct pbn_lock_pool *pool, struct pbn_lock *lock)
+{
+	idle_pbn_lock *idle;
+
+	/* A bit expensive, but will promptly catch some use-after-free errors. */
+	memset(lock, 0, sizeof(*lock));
+
+	idle = container_of(lock, idle_pbn_lock, lock);
+	INIT_LIST_HEAD(&idle->entry);
+	list_add_tail(&idle->entry, &pool->idle_list);
+
+	ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed");
+	pool->borrowed -= 1;
+}
+
+/**
+ * make_pbn_lock_pool() - Create a new PBN lock pool and all the lock instances it can loan out.
+ *
+ * @capacity: The number of PBN locks to allocate for the pool.
+ * @pool_ptr: A pointer to receive the new pool.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr)
+{
+	size_t i;
+	struct pbn_lock_pool *pool;
+	int result;
+
+	result = UDS_ALLOCATE_EXTENDED(struct pbn_lock_pool,
+				       capacity,
+				       idle_pbn_lock,
+				       __func__,
+				       &pool);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	pool->capacity = capacity;
+	pool->borrowed = capacity;
+	INIT_LIST_HEAD(&pool->idle_list);
+
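+	/*
+	 * All locks start out counted as borrowed; returning each one through the normal return
+	 * path populates the idle list and leaves the borrowed count at zero.
+	 */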
+	for (i = 0; i < capacity; i++)
+		return_pbn_lock_to_pool(pool, &pool->locks[i].lock);
+
+	*pool_ptr = pool;
+	return VDO_SUCCESS;
+}
+
+/**
+ * free_pbn_lock_pool() - Free a PBN lock pool.
+ * @pool: The lock pool to free.
+ *
+ * This also frees all the PBN locks it allocated, so the caller must ensure that all locks have
+ * been returned to the pool.
+ */
+static void free_pbn_lock_pool(struct pbn_lock_pool *pool)
+{
+	if (pool == NULL)
+		return;
+
+	ASSERT_LOG_ONLY(pool->borrowed == 0,
+			"All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan",
+			pool->borrowed);
+	UDS_FREE(pool);
+}
+
+/**
+ * borrow_pbn_lock_from_pool() - Borrow a PBN lock from the pool and initialize it with the
+ *				 provided type.
+ * @pool: The pool from which to borrow.
+ * @type: The type with which to initialize the lock.
+ * @lock_ptr:  A pointer to receive the borrowed lock.
+ *
+ * Pools do not grow on demand or allocate memory, so this will fail if the pool is empty. Borrowed
+ * locks are still associated with this pool and must be returned to only this pool.
+ *
+ * Return: VDO_SUCCESS, or VDO_LOCK_ERROR if the pool is empty.
+ */
+static int __must_check
+borrow_pbn_lock_from_pool(struct pbn_lock_pool *pool,
+			  enum pbn_lock_type type,
+			  struct pbn_lock **lock_ptr)
+{
+	int result;
+	struct list_head *idle_entry;
+	idle_pbn_lock *idle;
+
+	if (pool->borrowed >= pool->capacity)
+		return uds_log_error_strerror(VDO_LOCK_ERROR, "no free PBN locks left to borrow");
+	pool->borrowed += 1;
+
+	result = ASSERT(!list_empty(&pool->idle_list),
+			"idle list should not be empty if pool not at capacity");
+	if (result != VDO_SUCCESS)
+		return result;
+
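+	/* Take the most recently returned lock from the tail of the idle list. */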
+	idle_entry = pool->idle_list.prev;
+	list_del(idle_entry);
+	memset(idle_entry, 0, sizeof(*idle_entry));
+
+	idle = list_entry(idle_entry, idle_pbn_lock, entry);
+	idle->lock.holder_count = 0;
+	set_pbn_lock_type(&idle->lock, type);
+
+	*lock_ptr = &idle->lock;
+	return VDO_SUCCESS;
+}
+
+/**
+ * initialize_zone() - Initialize a physical zone.
+ * @vdo: The vdo to which the zone will belong.
+ * @zones: The physical_zones to which the zone being initialized belongs.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int initialize_zone(struct vdo *vdo, struct physical_zones *zones)
+{
+	int result;
+	zone_count_t zone_number = zones->zone_count;
+	struct physical_zone *zone = &zones->zones[zone_number];
+
+	result = vdo_make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->pbn_operations);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = make_pbn_lock_pool(LOCK_POOL_CAPACITY, &zone->lock_pool);
+	if (result != VDO_SUCCESS) {
+		vdo_free_int_map(zone->pbn_operations);
+		return result;
+	}
+
+	zone->zone_number = zone_number;
+	zone->thread_id = vdo->thread_config.physical_threads[zone_number];
+	zone->allocator = &vdo->depot->allocators[zone_number];
+	zone->next = &zones->zones[(zone_number + 1) % vdo->thread_config.physical_zone_count];
+	result = vdo_make_default_thread(vdo, zone->thread_id);
+	if (result != VDO_SUCCESS) {
+		free_pbn_lock_pool(UDS_FORGET(zone->lock_pool));
+		vdo_free_int_map(zone->pbn_operations);
+		return result;
+	}
+	return result;
+}
+
+/**
+ * vdo_make_physical_zones() - Make the physical zones for a vdo.
+ * @vdo: The vdo being constructed.
+ * @zones_ptr: A pointer to hold the zones.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr)
+{
+	struct physical_zones *zones;
+	int result;
+	zone_count_t zone_count = vdo->thread_config.physical_zone_count;
+
+	if (zone_count == 0)
+		return VDO_SUCCESS;
+
+	result = UDS_ALLOCATE_EXTENDED(struct physical_zones,
+				       zone_count,
+				       struct physical_zone,
+				       __func__,
+				       &zones);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (zones->zone_count = 0; zones->zone_count < zone_count; zones->zone_count++) {
+		result = initialize_zone(vdo, zones);
+		if (result != VDO_SUCCESS) {
+			vdo_free_physical_zones(zones);
+			return result;
+		}
+	}
+
+	*zones_ptr = zones;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_physical_zones() - Destroy the physical zones.
+ * @zones: The zones to free.
+ */
+void vdo_free_physical_zones(struct physical_zones *zones)
+{
+	zone_count_t index;
+
+	if (zones == NULL)
+		return;
+
+	for (index = 0; index < zones->zone_count; index++) {
+		struct physical_zone *zone = &zones->zones[index];
+
+		free_pbn_lock_pool(UDS_FORGET(zone->lock_pool));
+		vdo_free_int_map(UDS_FORGET(zone->pbn_operations));
+	}
+
+	UDS_FREE(zones);
+}
+
+/**
+ * vdo_get_physical_zone_pbn_lock() - Get the lock on a PBN if one exists.
+ * @zone: The physical zone responsible for the PBN.
+ * @pbn: The physical block number whose lock is desired.
+ *
+ * Return: The lock or NULL if the PBN is not locked.
+ */
+struct pbn_lock *
+vdo_get_physical_zone_pbn_lock(struct physical_zone *zone, physical_block_number_t pbn)
+{
+	return ((zone == NULL) ? NULL : vdo_int_map_get(zone->pbn_operations, pbn));
+}
+
+/**
+ * vdo_attempt_physical_zone_pbn_lock() - Attempt to lock a physical block in the zone responsible
+ *					  for it.
+ * @zone: The physical zone responsible for the PBN.
+ * @pbn: The physical block number to lock.
+ * @type: The type with which to initialize a new lock.
+ * @lock_ptr:  A pointer to receive the lock, existing or new.
+ *
+ * If the PBN is already locked, the existing lock will be returned. Otherwise, a new lock instance
+ * will be borrowed from the pool, initialized, and returned. The lock owner will be NULL for a new
+ * lock acquired by the caller, who is responsible for setting that field promptly. The lock owner
+ * will be non-NULL when there is already an existing lock on the PBN.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone,
+				       physical_block_number_t pbn,
+				       enum pbn_lock_type type,
+				       struct pbn_lock **lock_ptr)
+{
+	/*
+	 * Borrow and prepare a lock from the pool so we don't have to do two int_map accesses in
+	 * the common case of no lock contention.
+	 */
+	struct pbn_lock *lock, *new_lock = NULL;
+	int result;
+
+	result = borrow_pbn_lock_from_pool(zone->lock_pool, type, &new_lock);
+	if (result != VDO_SUCCESS) {
+		ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock");
+		return result;
+	}
+
+	result = vdo_int_map_put(zone->pbn_operations, pbn, new_lock, false, (void **) &lock);
+	if (result != VDO_SUCCESS) {
+		return_pbn_lock_to_pool(zone->lock_pool, new_lock);
+		return result;
+	}
+
+	if (lock != NULL) {
+		/* The lock is already held, so we don't need the borrowed one. */
+		return_pbn_lock_to_pool(zone->lock_pool, UDS_FORGET(new_lock));
+		result = ASSERT(lock->holder_count > 0,
+				"physical block %llu lock held",
+				(unsigned long long) pbn);
+		if (result != VDO_SUCCESS)
+			return result;
+		*lock_ptr = lock;
+	} else {
+		*lock_ptr = new_lock;
+	}
+	return VDO_SUCCESS;
+}
+
+/**
+ * allocate_and_lock_block() - Attempt to allocate a block from this zone.
+ * @allocation: The struct allocation of the data_vio attempting to allocate.
+ *
+ * If a block is allocated, the recipient will also hold a lock on it.
+ *
+ * Return: VDO_SUCCESS if a block was allocated, or an error code.
+ */
+static int allocate_and_lock_block(struct allocation *allocation)
+{
+	int result;
+	struct pbn_lock *lock;
+
+	ASSERT_LOG_ONLY(allocation->lock == NULL,
+			"must not allocate a block while already holding a lock on one");
+
+	result = vdo_allocate_block(allocation->zone->allocator, &allocation->pbn);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_attempt_physical_zone_pbn_lock(allocation->zone,
+						    allocation->pbn,
+						    allocation->write_lock_type,
+						    &lock);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (lock->holder_count > 0)
+		/* This block is already locked, which should be impossible. */
+		return uds_log_error_strerror(VDO_LOCK_ERROR,
+					      "Newly allocated block %llu was spuriously locked (holder_count=%u)",
+					      (unsigned long long) allocation->pbn,
+					      lock->holder_count);
+
+	/* We've successfully acquired a new lock, so mark it as ours. */
+	lock->holder_count += 1;
+	allocation->lock = lock;
+	vdo_assign_pbn_lock_provisional_reference(lock);
+	return VDO_SUCCESS;
+}
+
+/**
+ * retry_allocation() - Retry allocating a block now that we're done waiting for scrubbing.
+ * @waiter: The data_vio that was waiting to allocate.
+ * @context: The context (unused).
+ */
+static void retry_allocation(struct waiter *waiter, void *context __always_unused)
+{
+	struct data_vio *data_vio = waiter_as_data_vio(waiter);
+
+	/* Now that some slab has scrubbed, restart the allocation process. */
+	data_vio->allocation.wait_for_clean_slab = false;
+	data_vio->allocation.first_allocation_zone = data_vio->allocation.zone->zone_number;
+	continue_data_vio(data_vio);
+}
+
+/**
+ * continue_allocating() - Continue searching for an allocation by enqueuing to wait for scrubbing
+ *			   or switching to the next zone.
+ * @data_vio: The data_vio attempting to get an allocation.
+ *
+ * This method should only be called from the error handler set in data_vio_allocate_data_block.
+ *
+ * Return: true if the allocation process has continued in another zone.
+ */
+static bool continue_allocating(struct data_vio *data_vio)
+{
+	struct allocation *allocation = &data_vio->allocation;
+	struct physical_zone *zone = allocation->zone;
+	struct vdo_completion *completion = &data_vio->vio.completion;
+	int result = VDO_SUCCESS;
+	bool was_waiting = allocation->wait_for_clean_slab;
+	bool tried_all = (allocation->first_allocation_zone == zone->next->zone_number);
+
+	vdo_reset_completion(completion);
+
+	if (tried_all && !was_waiting) {
+		/*
+		 * We've already looked in all the zones, and found nothing. So go through the
+		 * zones again, and wait for each to scrub before trying to allocate.
+		 */
+		allocation->wait_for_clean_slab = true;
+		allocation->first_allocation_zone = zone->zone_number;
+	}
+
+	if (allocation->wait_for_clean_slab) {
+		data_vio->waiter.callback = retry_allocation;
+		result = vdo_enqueue_clean_slab_waiter(zone->allocator, &data_vio->waiter);
+		if (result == VDO_SUCCESS)
+			/* We've enqueued to wait for a slab to be scrubbed. */
+			return true;
+
+		if ((result != VDO_NO_SPACE) || (was_waiting && tried_all)) {
+			vdo_set_completion_result(completion, result);
+			return false;
+		}
+	}
+
+	allocation->zone = zone->next;
+	completion->callback_thread_id = allocation->zone->thread_id;
+	vdo_launch_completion(completion);
+	return true;
+}
+
+/**
+ * vdo_allocate_block_in_zone() - Attempt to allocate a block in the current physical zone, and if
+ *				  that fails try the next if possible.
+ * @data_vio: The data_vio needing an allocation.
+ *
+ * Return: true if a block was allocated; if not, the data_vio will have been dispatched, so the
+ *         caller must not touch it.
+ */
+bool vdo_allocate_block_in_zone(struct data_vio *data_vio)
+{
+	int result = allocate_and_lock_block(&data_vio->allocation);
+
+	if (result == VDO_SUCCESS)
+		return true;
+
+	if ((result != VDO_NO_SPACE) || !continue_allocating(data_vio))
+		continue_data_vio_with_error(data_vio, result);
+
+	return false;
+}
+
+/**
+ * vdo_release_physical_zone_pbn_lock() - Release a physical block lock if it is held and return it
+ *                                        to the lock pool.
+ * @zone: The physical zone in which the lock was obtained.
+ * @locked_pbn: The physical block number to unlock.
+ * @lock: The lock being released.
+ *
+ * It must be the last live reference, as if the memory were being freed (the
+ * lock memory will be re-initialized or zeroed).
+ */
+void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone,
+					physical_block_number_t locked_pbn,
+					struct pbn_lock *lock)
+{
+	struct pbn_lock *holder;
+
+	if (lock == NULL)
+		return;
+
+	ASSERT_LOG_ONLY(lock->holder_count > 0, "should not be releasing a lock that is not held");
+
+	lock->holder_count -= 1;
+	if (lock->holder_count > 0)
+		/* The lock was shared and is still referenced, so don't release it yet. */
+		return;
+
+	holder = vdo_int_map_remove(zone->pbn_operations, locked_pbn);
+	ASSERT_LOG_ONLY((lock == holder),
+			"physical block lock mismatch for block %llu",
+			(unsigned long long) locked_pbn);
+
+	release_pbn_lock_provisional_reference(lock, locked_pbn, zone->allocator);
+	return_pbn_lock_to_pool(zone->lock_pool, lock);
+}
+
+/**
+ * vdo_dump_physical_zone() - Dump information about a physical zone to the log for debugging.
+ * @zone: The zone to dump.
+ */
+void vdo_dump_physical_zone(const struct physical_zone *zone)
+{
+	vdo_dump_block_allocator(zone->allocator);
+}
diff --git a/drivers/md/dm-vdo/physical-zone.h b/drivers/md/dm-vdo/physical-zone.h
new file mode 100644
index 00000000000..55b7341ff39
--- /dev/null
+++ b/drivers/md/dm-vdo/physical-zone.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_PHYSICAL_ZONE_H
+#define VDO_PHYSICAL_ZONE_H
+
+#include <linux/atomic.h>
+
+#include "types.h"
+
+/*
+ * The type of a PBN lock.
+ */
+enum pbn_lock_type {
+	VIO_READ_LOCK,
+	VIO_WRITE_LOCK,
+	VIO_BLOCK_MAP_WRITE_LOCK,
+};
+
+struct pbn_lock_implementation;
+
+/*
+ * A PBN lock.
+ */
+struct pbn_lock {
+	/* The implementation of the lock */
+	const struct pbn_lock_implementation *implementation;
+
+	/* The number of VIOs holding or sharing this lock */
+	data_vio_count_t holder_count;
+	/*
+	 * The number of compressed block writers holding a share of this lock while they are
+	 * acquiring a reference to the PBN.
+	 */
+	u8 fragment_locks;
+
+	/* Whether the locked PBN has been provisionally referenced on behalf of the lock holder. */
+	bool has_provisional_reference;
+
+	/*
+	 * For read locks, the number of references that were known to be available on the locked
+	 * block at the time the lock was acquired.
+	 */
+	u8 increment_limit;
+
+	/*
+	 * For read locks, the number of data_vios that have tried to claim one of the available
+	 * increments during the lifetime of the lock. Each claim will first increment this
+	 * counter, so it can exceed the increment limit.
+	 */
+	atomic_t increments_claimed;
+};
+
+struct physical_zone {
+	/* Which physical zone this is */
+	zone_count_t zone_number;
+	/* The thread ID for this zone */
+	thread_id_t thread_id;
+	/* In progress operations keyed by PBN */
+	struct int_map *pbn_operations;
+	/* Pool of unused pbn_lock instances */
+	struct pbn_lock_pool *lock_pool;
+	/* The block allocator for this zone */
+	struct block_allocator *allocator;
+	/* The next zone from which to attempt an allocation */
+	struct physical_zone *next;
+};
+
+struct physical_zones {
+	/* The number of zones */
+	zone_count_t zone_count;
+	/* The physical zones themselves */
+	struct physical_zone zones[];
+};
+
+bool __must_check vdo_is_pbn_read_lock(const struct pbn_lock *lock);
+void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write);
+bool __must_check vdo_claim_pbn_lock_increment(struct pbn_lock *lock);
+
+/**
+ * vdo_pbn_lock_has_provisional_reference() - Check whether a PBN lock has a provisional reference.
+ * @lock: The PBN lock.
+ */
+static inline bool vdo_pbn_lock_has_provisional_reference(struct pbn_lock *lock)
+{
+	return ((lock != NULL) && lock->has_provisional_reference);
+}
+
+void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock);
+void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock);
+
+int __must_check vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr);
+
+void vdo_free_physical_zones(struct physical_zones *zones);
+
+struct pbn_lock * __must_check
+vdo_get_physical_zone_pbn_lock(struct physical_zone *zone, physical_block_number_t pbn);
+
+int __must_check
+vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone,
+				   physical_block_number_t pbn,
+				   enum pbn_lock_type type,
+				   struct pbn_lock **lock_ptr);
+
+bool __must_check vdo_allocate_block_in_zone(struct data_vio *data_vio);
+
+void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone,
+					physical_block_number_t locked_pbn,
+					struct pbn_lock *lock);
+
+void vdo_dump_physical_zone(const struct physical_zone *zone);
+
+#endif /* VDO_PHYSICAL_ZONE_H */
diff --git a/drivers/md/dm-vdo/pointer-map.c b/drivers/md/dm-vdo/pointer-map.c
new file mode 100644
index 00000000000..8511fef70ee
--- /dev/null
+++ b/drivers/md/dm-vdo/pointer-map.c
@@ -0,0 +1,691 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+/**
+ * DOC:
+ *
+ * Hash table implementation of a map from integers to pointers, implemented using the Hopscotch
+ * Hashing algorithm by Herlihy, Shavit, and Tzafrir (see
+ * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does not contain any of the
+ * locking/concurrency features of the algorithm, just the collision resolution scheme.
+ *
+ * Hopscotch Hashing is based on hashing with open addressing and linear probing. All the entries
+ * are stored in a fixed array of buckets, with no dynamic allocation for collisions. Unlike linear
+ * probing, all the entries that hash to a given bucket are stored within a fixed neighborhood
+ * starting at that bucket. Chaining is effectively represented as a bit vector relative to each
+ * bucket instead of as pointers or explicit offsets.
+ *
+ * When an empty bucket cannot be found within a given neighborhood, subsequent neighborhoods are
+ * searched, and one or more entries will "hop" into those neighborhoods. When this process works,
+ * an empty bucket will move into the desired neighborhood, allowing the entry to be added. When
+ * that process fails (typically when the buckets are around 90% full), the table must be resized
+ * and all the entries rehashed and added to the expanded table.
+ *
+ * Unlike linear probing, the number of buckets that must be searched in the worst case has a fixed
+ * upper bound (the size of the neighborhood). Those entries occupy a small number of memory cache
+ * lines, leading to improved use of the cache (fewer misses on both successful and unsuccessful
+ * searches). Hopscotch hashing outperforms linear probing at much higher load factors, so even
+ * with the increased memory burden for maintaining the hop vectors, less memory is needed to
+ * achieve that performance. Hopscotch is also immune to "contamination" from deleting entries
+ * since entries are genuinely removed instead of being replaced by a placeholder.
+ *
+ * The published description of the algorithm used a bit vector, but the paper alludes to an offset
+ * scheme which is used by this implementation. Since the entries in the neighborhood are within N
+ * entries of the hash bucket at the start of the neighborhood, a pair of small offset fields each
+ * log2(N) bits wide is all that's needed to maintain the hops as a linked list. In order to encode
+ * "no next hop" (i.e. NULL) as the natural initial value of zero, the offsets are biased by one
+ * (i.e. 0 => NULL, 1 => offset=0, 2 => offset=1, etc.) We can represent neighborhoods of up to 255
+ * entries with just 8+8=16 bits per entry. The hop list is sorted by hop offset so the first entry
+ * in the list is always the bucket closest to the start of the neighborhood.
+ *
+ * While individual accesses tend to be very fast, the table resize operations are very, very
+ * expensive. If an upper bound on the latency of adding an entry to the table is needed, we either
+ * need to ensure the table is pre-sized to be large enough so no resize is ever needed, or we'll
+ * need to develop an approach to incrementally resize the table.
+ */
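+
+/*
+ * Added illustration of the biased-offset scheme described above: suppose two keys hash to bucket
+ * 10 and end up stored in buckets 10 and 13. Then buckets[10].first_hop is 1 (biased offset 0,
+ * i.e. bucket 10 itself), buckets[10].next_hop is 4 (biased offset 3, i.e. bucket 13), and
+ * buckets[13].next_hop is 0, terminating the hop list.
+ */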
+
+#include "pointer-map.h"
+
+#include <linux/minmax.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+
+enum {
+	DEFAULT_CAPACITY = 16, /* the number of neighborhoods in a new table */
+	NEIGHBORHOOD = 255, /* the number of buckets in each neighborhood */
+	MAX_PROBES = 1024, /* limit on the number of probes for a free bucket */
+	NULL_HOP_OFFSET = 0, /* the hop offset value terminating the hop list */
+	DEFAULT_LOAD = 75 /* a compromise between memory use and performance */
+};
+
+/**
+ * struct bucket - Hash buckets.
+ *
+ * Buckets are packed together to reduce memory usage and improve cache efficiency. It would be
+ * tempting to encode the hop offsets separately and maintain alignment of key/value pairs, but
+ * it's crucial to keep the hop fields near the buckets that they use them so they'll tend to share
+ * cache lines.
+ */
+struct __packed bucket {
+	/**
+	 * @first_hop: The biased offset of the first entry in the hop list of the neighborhood
+	 * that hashes to this bucket.
+	 */
+	u8 first_hop;
+	/** @next_hop: the biased offset of the next bucket in the hop list. */
+	u8 next_hop;
+	/** @key: The key stored in this bucket. */
+	const void *key;
+	/** @value: The value stored in this bucket (NULL if empty). */
+	void *value;
+};
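+
+/*
+ * Added note, assuming 8-byte pointers and 64-byte cache lines: a packed bucket is
+ * 1 + 1 + 8 + 8 = 18 bytes, so a single cache line covers roughly three adjacent buckets along
+ * with their hop fields.
+ */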
+
+/**
+ * struct pointer_map - The concrete definition of the opaque pointer_map type.
+ *
+ * To avoid having to wrap the neighborhoods of the last entries back around to the start of the
+ * bucket array, we allocate a few more buckets at the end of the array instead, which is why
+ * capacity and bucket_count are different.
+ */
+struct pointer_map {
+	/** @size: The number of entries stored in the map. */
+	size_t size;
+	/** @capacity: The number of neighborhoods in the map. */
+	size_t capacity;
+	/** @bucket_count: The number of buckets in the bucket array. */
+	size_t bucket_count;
+	/** @buckets: The array of hash buckets. */
+	struct bucket *buckets;
+	/** @comparator: The function for comparing keys for equality. */
+	pointer_key_comparator *comparator;
+	/** @hasher: The function for getting a hash code from a key. */
+	pointer_key_hasher *hasher;
+};
+
+/**
+ * allocate_buckets() - Initialize a pointer_map.
+ * @map: The map to initialize.
+ * @capacity: The initial capacity of the map.
+ *
+ * Return: UDS_SUCCESS or an error code.
+ */
+static int allocate_buckets(struct pointer_map *map, size_t capacity)
+{
+	map->size = 0;
+	map->capacity = capacity;
+
+	/*
+	 * Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a full neighborhood
+	 * without having to wrap back around to element zero.
+	 */
+	map->bucket_count = capacity + (NEIGHBORHOOD - 1);
+	return UDS_ALLOCATE(map->bucket_count,
+			    struct bucket,
+			    "pointer_map buckets",
+			    &map->buckets);
+}
+
+/**
+ * vdo_make_pointer_map() - Allocate and initialize a pointer_map.
+ * @initial_capacity: The number of entries the map should initially be capable of holding (zero
+ *                    tells the map to use its own small default).
+ * @initial_load: The load factor of the map, expressed as an integer percentage (typically in the
+ *                range 50 to 90, with zero telling the map to use its own default).
+ * @comparator: The function to use to compare the referents of two pointer keys for equality.
+ * @hasher: The function to use to obtain the hash code associated with each pointer key.
+ * @map_ptr: A pointer to hold the new pointer_map.
+ *
+ * Return: UDS_SUCCESS or an error code.
+ */
+int vdo_make_pointer_map(size_t initial_capacity,
+			 unsigned int initial_load,
+			 pointer_key_comparator comparator,
+			 pointer_key_hasher hasher,
+			 struct pointer_map **map_ptr)
+{
+	int result;
+	struct pointer_map *map;
+	size_t capacity;
+
+	/* Use the default initial load if the caller did not specify one. */
+	if (initial_load == 0)
+		initial_load = DEFAULT_LOAD;
+	if (initial_load > 100)
+		return UDS_INVALID_ARGUMENT;
+
+	result = UDS_ALLOCATE(1, struct pointer_map, "pointer_map", &map);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	map->hasher = hasher;
+	map->comparator = comparator;
+
+	/* Use the default capacity if the caller did not specify one. */
+	capacity = (initial_capacity > 0) ? initial_capacity : DEFAULT_CAPACITY;
+
+	/*
+	 * Scale up the capacity by the specified initial load factor (e.g., to hold 1000 entries
+	 * at 80% load we need a capacity of 1250).
+	 */
+	capacity = capacity * 100 / initial_load;
+
+	result = allocate_buckets(map, capacity);
+	if (result != UDS_SUCCESS) {
+		vdo_free_pointer_map(UDS_FORGET(map));
+		return result;
+	}
+
+	*map_ptr = map;
+	return UDS_SUCCESS;
+}
+
+/**
+ * vdo_free_pointer_map() - Free a pointer_map.
+ * @map: The pointer_map to free.
+ *
+ * The map does not own the pointer keys and values stored in the map and they are not freed by
+ * this call.
+ */
+void vdo_free_pointer_map(struct pointer_map *map)
+{
+	if (map == NULL)
+		return;
+
+	UDS_FREE(UDS_FORGET(map->buckets));
+	UDS_FREE(UDS_FORGET(map));
+}
+
+/**
+ * vdo_pointer_map_size() - Get the number of entries stored in a pointer_map.
+ * @map: The pointer_map to query.
+ *
+ * Return: The number of entries in the map.
+ */
+size_t vdo_pointer_map_size(const struct pointer_map *map)
+{
+	return map->size;
+}
+
+/**
+ * dereference_hop() - Convert a biased hop offset within a neighborhood to a pointer to the bucket
+ *                     it references.
+ * @neighborhood: The first bucket in the neighborhood.
+ * @hop_offset: The biased hop offset to the desired bucket.
+ *
+ * Return: NULL if hop_offset is zero, otherwise a pointer to the bucket in the neighborhood at
+ *         hop_offset - 1.
+ */
+static struct bucket *dereference_hop(struct bucket *neighborhood, unsigned int hop_offset)
+{
+	if (hop_offset == NULL_HOP_OFFSET)
+		return NULL;
+
+	STATIC_ASSERT(NULL_HOP_OFFSET == 0);
+	return &neighborhood[hop_offset - 1];
+}
+
+/**
+ * insert_in_hop_list() - Add a bucket into the hop list for the neighborhood, inserting it into
+ *                        the list so the hop list remains sorted by hop offset.
+ * @neighborhood: The first bucket in the neighborhood.
+ * @new_bucket: The bucket to add to the hop list.
+ */
+static void insert_in_hop_list(struct bucket *neighborhood, struct bucket *new_bucket)
+{
+	/* Zero indicates a NULL hop offset, so bias the hop offset by one. */
+	int hop_offset = 1 + (new_bucket - neighborhood);
+
+	/* Handle the special case of adding a bucket at the start of the list. */
+	int next_hop = neighborhood->first_hop;
+
+	if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) {
+		new_bucket->next_hop = next_hop;
+		neighborhood->first_hop = hop_offset;
+		return;
+	}
+
+	/* Search the hop list for the insertion point that maintains the sort order. */
+	for (;;) {
+		struct bucket *bucket = dereference_hop(neighborhood, next_hop);
+
+		next_hop = bucket->next_hop;
+
+		if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) {
+			new_bucket->next_hop = next_hop;
+			bucket->next_hop = hop_offset;
+			return;
+		}
+	}
+}
+
+/**
+ * select_bucket() - Select and return the hash bucket for a given search key.
+ * @map: The map to search.
+ * @key: The mapping key.
+ */
+static struct bucket *select_bucket(const struct pointer_map *map, const void *key)
+{
+	/*
+	 * Scale the 32-bit hash to a bucket index by treating it as a binary fraction and
+	 * multiplying that by the capacity. If the hash is uniformly distributed over [0 ..
+	 * 2^32-1], then (hash * capacity / 2^32) should be uniformly distributed over [0 ..
+	 * capacity-1]. The multiply and shift is much faster than a divide (modulus) on X86 CPUs.
+	 */
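+	/*
+	 * Added example: a hash of 0x80000000, halfway through the u32 range, maps to bucket
+	 * capacity / 2.
+	 */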
+	u64 hash = map->hasher(key);
+
+	return &map->buckets[(hash * map->capacity) >> 32];
+}
+
+/**
+ * search_hop_list() - Search the hop list.
+ * @map: The map being searched.
+ * @bucket: The map bucket to search for the key.
+ * @key: The mapping key.
+ * @previous_ptr: if not NULL, a pointer in which to store the bucket in the list preceding the one
+ *                that had the matching key.
+ *
+ * Searches the hop list associated with a given hash bucket for a given search key. If the key is
+ * found, returns a pointer to the entry (bucket or collision), otherwise returns NULL.
+ *
+ * Return: an entry that matches the key, or NULL if not found.
+ */
+static struct bucket *search_hop_list(struct pointer_map *map,
+				      struct bucket *bucket,
+				      const void *key,
+				      struct bucket **previous_ptr)
+{
+	struct bucket *previous = NULL;
+	unsigned int next_hop = bucket->first_hop;
+
+	while (next_hop != NULL_HOP_OFFSET) {
+		/* Check the neighboring bucket indexed by the offset for the desired key. */
+		struct bucket *entry = dereference_hop(bucket, next_hop);
+
+		if ((entry->value != NULL) && map->comparator(key, entry->key)) {
+			if (previous_ptr != NULL)
+				*previous_ptr = previous;
+			return entry;
+		}
+		next_hop = entry->next_hop;
+		previous = entry;
+	}
+	return NULL;
+}
+
+/**
+ * vdo_pointer_map_get() - Retrieve the value associated with a given key from the pointer_map.
+ * @map: The pointer_map to query.
+ * @key: The key to look up (may be NULL if the comparator and hasher functions support it).
+ *
+ * Return: the value associated with the given key, or NULL if the key is not mapped to any value.
+ */
+void *vdo_pointer_map_get(struct pointer_map *map, const void *key)
+{
+	struct bucket *match = search_hop_list(map, select_bucket(map, key), key, NULL);
+
+	return ((match != NULL) ? match->value : NULL);
+}
+
+/**
+ * resize_buckets() - Increase the number of hash buckets and rehash all the existing entries,
+ *                    storing them in the new buckets.
+ * @map: The map to resize.
+ */
+static int resize_buckets(struct pointer_map *map)
+{
+	int result;
+	size_t i;
+
+	/* Copy the top-level map data to the stack. */
+	struct pointer_map old_map = *map;
+
+	/* Re-initialize the map to be empty and 50% larger. */
+	size_t new_capacity = map->capacity / 2 * 3;
+
+	uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu",
+		     __func__,
+		     map->capacity,
+		     new_capacity,
+		     map->size);
+	result = allocate_buckets(map, new_capacity);
+	if (result != UDS_SUCCESS) {
+		*map = old_map;
+		return result;
+	}
+
+	/* Populate the new hash table from the entries in the old bucket array. */
+	for (i = 0; i < old_map.bucket_count; i++) {
+		struct bucket *entry = &old_map.buckets[i];
+
+		if (entry->value == NULL)
+			continue;
+
+		result = vdo_pointer_map_put(map, entry->key, entry->value, true, NULL);
+		if (result != UDS_SUCCESS) {
+			/* Destroy the new partial map and restore the map from the stack. */
+			UDS_FREE(UDS_FORGET(map->buckets));
+			*map = old_map;
+			return result;
+		}
+	}
+
+	/* Destroy the old bucket array. */
+	UDS_FREE(UDS_FORGET(old_map.buckets));
+	return UDS_SUCCESS;
+}
+
+/**
+ * find_empty_bucket() - Probe the bucket array starting at the given bucket for the next empty
+ *                       bucket, returning a pointer to it.
+ * @map: The map containing the buckets to search.
+ * @bucket: The bucket at which to start probing.
+ * @max_probes: The maximum number of buckets to search.
+ *
+ * NULL will be returned if the search reaches the end of the bucket array or if the number of
+ * linear probes exceeds a specified limit.
+ *
+ * Return: The next empty bucket, or NULL if the search failed.
+ */
+static struct bucket *
+find_empty_bucket(struct pointer_map *map, struct bucket *bucket, unsigned int max_probes)
+{
+	/*
+	 * Limit the search to whichever is nearer: the end of the bucket array or a fixed
+	 * distance beyond the initial bucket.
+	 */
+	ptrdiff_t remaining = &map->buckets[map->bucket_count] - bucket;
+	struct bucket *sentinel = &bucket[min_t(ptrdiff_t, remaining, max_probes)];
+	struct bucket *entry;
+
+	for (entry = bucket; entry < sentinel; entry++)
+		if (entry->value == NULL)
+			return entry;
+	return NULL;
+}
+
+/**
+ * move_empty_bucket() - Move an empty bucket closer to the start of the bucket array.
+ * @map: The map containing the bucket.
+ * @hole: The empty bucket to fill with an entry that precedes it in one of its enclosing
+ *        neighborhoods.
+ *
+ * This searches the neighborhoods that contain the empty bucket for a non-empty bucket closer to
+ * the start of the array. If such a bucket is found, this swaps the two buckets by moving the
+ * entry to the empty bucket.
+ *
+ * Return: The bucket that was vacated by moving its entry to the provided hole, or NULL if no
+ *         entry could be moved.
+ */
+static struct bucket *
+move_empty_bucket(struct pointer_map *map __always_unused, struct bucket *hole)
+{
+	/*
+	 * Examine every neighborhood that the empty bucket is part of, starting with the one in
+	 * which it is the last bucket. No boundary check is needed for the negative array
+	 * arithmetic since this function is only called when hole is at least NEIGHBORHOOD cells
+	 * deeper into the array than a valid bucket.
+	 */
+	struct bucket *bucket;
+
+	for (bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) {
+		/*
+		 * Find the entry that is nearest to the bucket, which means it will be nearest to
+		 * the hash bucket whose neighborhood is full.
+		 */
+		struct bucket *new_hole = dereference_hop(bucket, bucket->first_hop);
+
+		if (new_hole == NULL)
+			/*
+			 * There are no buckets in this neighborhood that are in use by this one
+			 * (they must all be owned by overlapping neighborhoods).
+			 */
+			continue;
+
+		/*
+		 * Skip this bucket if its first entry is actually further away than the hole that
+		 * we're already trying to fill.
+		 */
+		if (hole < new_hole)
+			continue;
+
+		/*
+		 * We've found an entry in this neighborhood that we can "hop" further away, moving
+		 * the hole closer to the hash bucket, if not all the way into its neighborhood.
+		 */
+
+		/*
+		 * The entry that will be the new hole is the first bucket in the list, so setting
+		 * first_hop is all that's needed remove it from the list.
+		 * first_hop is all that's needed to remove it from the list.
+		bucket->first_hop = new_hole->next_hop;
+		new_hole->next_hop = NULL_HOP_OFFSET;
+
+		/* Move the entry into the original hole. */
+		hole->key = new_hole->key;
+		hole->value = new_hole->value;
+		new_hole->value = NULL;
+
+		/* Insert the filled hole into the hop list for the neighborhood. */
+		insert_in_hop_list(bucket, hole);
+		return new_hole;
+	}
+
+	/* We couldn't find an entry to relocate to the hole. */
+	return NULL;
+}
+
+/**
+ * update_mapping() - Find and update any existing mapping for a given key, returning the value
+ *                    associated with the key in the provided pointer.
+ * @map: The pointer_map to attempt to modify.
+ * @neighborhood: The first bucket in the neighborhood that would contain the search key.
+ * @key: The key with which to associate the new value.
+ * @new_value: The value to be associated with the key.
+ * @update: Whether to overwrite an existing value.
+ * @old_value_ptr: A pointer in which to store the old value (unmodified if no mapping was found).
+ *
+ * Return: true if the map contains a mapping for the key, false if it does not.
+ */
+static bool update_mapping(struct pointer_map *map,
+			   struct bucket *neighborhood,
+			   const void *key,
+			   void *new_value,
+			   bool update,
+			   void **old_value_ptr)
+{
+	struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL);
+
+	if (bucket == NULL)
+		/* There is no bucket containing the key in the neighborhood. */
+		return false;
+
+	/*
+	 * Return the value of the current mapping (if desired) and update the mapping with the new
+	 * value (if desired).
+	 */
+	if (old_value_ptr != NULL)
+		*old_value_ptr = bucket->value;
+	if (update) {
+		/*
+		 * We're dropping the old key pointer on the floor here, assuming it's a property
+		 * of the value or that it's otherwise safe to just forget.
+		 */
+		bucket->key = key;
+		bucket->value = new_value;
+	}
+	return true;
+}
+
+/**
+ * find_or_make_vacancy() - Find an empty bucket in a specified neighborhood for a new mapping or
+ *                          attempt to re-arrange mappings so there is such a bucket.
+ * @map: The pointer_map to search or modify.
+ * @neighborhood: The first bucket in the neighborhood in which an empty bucket is needed for a new
+ *                mapping.
+ *
+ * This operation may fail (returning NULL) if an empty bucket is not available or could not be
+ * relocated to the neighborhood.
+ *
+ * Return: A pointer to an empty bucket in the desired neighborhood, or NULL if a vacancy could not
+ *         be found or arranged.
+ */
+static struct bucket *find_or_make_vacancy(struct pointer_map *map, struct bucket *neighborhood)
+{
+	/* Probe within and beyond the neighborhood for the first empty bucket. */
+	struct bucket *hole = find_empty_bucket(map, neighborhood, MAX_PROBES);
+
+	/*
+	 * Keep trying until the empty bucket is in the bucket's neighborhood or we are unable to
+	 * move it any closer by swapping it with a filled bucket.
+	 */
+	while (hole != NULL) {
+		int distance = hole - neighborhood;
+
+		if (distance < NEIGHBORHOOD)
+			/*
+			 * We've found or relocated an empty bucket close enough to the initial
+			 * hash bucket to be referenced by its hop vector.
+			 */
+			return hole;
+
+		/*
+		 * The nearest empty bucket isn't within the neighborhood that must contain the new
+		 * entry, so try to swap it with a bucket that is closer.
+		 */
+		hole = move_empty_bucket(map, hole);
+	}
+
+	return NULL;
+}
+
+/**
+ * vdo_pointer_map_put() - Try to associate a value (a pointer) with a key in a pointer_map.
+ * @map: The pointer_map to attempt to modify.
+ * @key: The key with which to associate the new value (may be NULL if the comparator and hasher
+ *       functions support it).
+ * @new_value: The value to be associated with the key.
+ * @update: Whether to overwrite an existing value.
+ * @old_value_ptr: A pointer in which to store either the old value (if the key was already mapped)
+ *                 or NULL if the map did not contain the key; NULL may be provided if the caller
+ *                 does not need to know the old value.
+ *
+ * If the map already contains a mapping for the provided key, the old value is only replaced with
+ * the specified value if update is true. In either case the old value is returned. If the map does
+ * not already contain a value for the specified key, the new value is added regardless of the
+ * value of update.
+ *
+ * If the value stored in the map is updated, then the key stored in the map will also be updated
+ * with the key provided by this call. The old key will not be returned due to the memory
+ * management assumptions described in the interface header comment.
+ *
+ * Return: UDS_SUCCESS or an error code.
+ */
+int vdo_pointer_map_put(struct pointer_map *map,
+			const void *key,
+			void *new_value,
+			bool update,
+			void **old_value_ptr)
+{
+	struct bucket *neighborhood, *bucket;
+
+	if (new_value == NULL)
+		return UDS_INVALID_ARGUMENT;
+
+	/*
+	 * Select the bucket at the start of the neighborhood that must contain any entry for the
+	 * provided key.
+	 */
+	neighborhood = select_bucket(map, key);
+
+	/*
+	 * Check whether the neighborhood already contains an entry for the key, in which case we
+	 * optionally update it, returning the old value.
+	 */
+	if (update_mapping(map, neighborhood, key, new_value, update, old_value_ptr))
+		return UDS_SUCCESS;
+
+	/*
+	 * Find an empty bucket in the desired neighborhood for the new entry or re-arrange entries
+	 * in the map so there is such a bucket. This operation will usually succeed; the loop body
+	 * will only be executed on the rare occasions that we have to resize the map.
+	 */
+	while ((bucket = find_or_make_vacancy(map, neighborhood)) == NULL) {
+		/*
+		 * There is no empty bucket in which to put the new entry in the current map, so
+		 * we're forced to allocate a new bucket array with a larger capacity, re-hash all
+		 * the entries into those buckets, and try again (a very expensive operation for
+		 * large maps).
+		 */
+		int result = resize_buckets(map);
+
+		if (result != UDS_SUCCESS)
+			return result;
+
+		/*
+		 * Resizing the map invalidates all pointers to buckets, so
+		 * recalculate the neighborhood pointer.
+		 */
+		neighborhood = select_bucket(map, key);
+	}
+
+	/* Put the new entry in the empty bucket, adding it to the neighborhood. */
+	bucket->key = key;
+	bucket->value = new_value;
+	insert_in_hop_list(neighborhood, bucket);
+	map->size += 1;
+
+	/*
+	 * There was no existing entry, so there was no old value to be
+	 * returned.
+	 */
+	if (old_value_ptr != NULL)
+		*old_value_ptr = NULL;
+	return UDS_SUCCESS;
+}
+
+/**
+ * vdo_pointer_map_remove() - Remove the mapping for a given key from the pointer_map.
+ * @map: The pointer_map from which to remove the mapping.
+ * @key: The key whose mapping is to be removed (may be NULL if the comparator and hasher functions
+ *       support it).
+ *
+ * Return: the value that was associated with the key, or NULL if it was not mapped.
+ */
+void *vdo_pointer_map_remove(struct pointer_map *map, const void *key)
+{
+	void *value;
+
+	/* Select the bucket to search and search it for an existing entry. */
+	struct bucket *bucket = select_bucket(map, key);
+	struct bucket *previous;
+	struct bucket *victim = search_hop_list(map, bucket, key, &previous);
+
+	if (victim == NULL)
+		/* There is no matching entry to remove. */
+		return NULL;
+
+	/*
+	 * We found an entry to remove. Save the mapped value to return later and empty the bucket.
+	 */
+	map->size -= 1;
+	value = victim->value;
+	victim->value = NULL;
+	victim->key = NULL;
+
+	/* The victim bucket is now empty, but it still needs to be spliced out of the hop list. */
+	if (previous == NULL)
+		/* The victim is the head of the list, so swing first_hop. */
+		bucket->first_hop = victim->next_hop;
+	else
+		previous->next_hop = victim->next_hop;
+	victim->next_hop = NULL_HOP_OFFSET;
+
+	return value;
+}
diff --git a/drivers/md/dm-vdo/pointer-map.h b/drivers/md/dm-vdo/pointer-map.h
new file mode 100644
index 00000000000..18de2979174
--- /dev/null
+++ b/drivers/md/dm-vdo/pointer-map.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_POINTER_MAP_H
+#define VDO_POINTER_MAP_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/*
+ * A pointer_map associates pointer values (<code>void *</code>) with the data referenced by
+ * pointer keys (<code>void *</code>). <code>NULL</code> pointer values are not supported. A
+ * <code>NULL</code> key value is supported when the instance's key comparator and hasher functions
+ * support it.
+ *
+ * The map is implemented as a hash table, which should provide constant-time insert, query, and
+ * remove operations, although the insert may occasionally grow the table, which is linear in the
+ * number of entries in the map. The table will grow as needed to hold new entries, but will not
+ * shrink as entries are removed.
+ *
+ * The key and value pointers passed to the map are retained and used by the map, but are not owned
+ * by the map. Freeing the map does not attempt to free the pointers. The client is entirely
+ * responsible for the memory management of the keys and values. The current interface and
+ * implementation assume that keys will be properties of the values, or that keys will not be
+ * memory managed, or that keys will not need to be freed as a result of being replaced when a key
+ * is re-mapped.
+ */
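+
+/*
+ * Illustrative usage sketch (added for clarity, not part of the original interface; the example_*
+ * identifiers are hypothetical):
+ *
+ *	static bool example_same_string(const void *a, const void *b)
+ *	{
+ *		return strcmp(a, b) == 0;
+ *	}
+ *
+ *	static u32 example_hash_string(const void *key)
+ *	{
+ *		return full_name_hash(NULL, key, strlen(key));
+ *	}
+ *
+ *	static char example_value[] = "payload";
+ *	struct pointer_map *map;
+ *
+ *	if (vdo_make_pointer_map(0, 0, example_same_string, example_hash_string, &map) ==
+ *	    UDS_SUCCESS) {
+ *		vdo_pointer_map_put(map, "a key", example_value, true, NULL);
+ *		char *found = vdo_pointer_map_get(map, "a key");
+ *		vdo_pointer_map_remove(map, "a key");
+ *		vdo_free_pointer_map(map);
+ *	}
+ */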
+
+struct pointer_map;
+
+/**
+ * typedef pointer_key_comparator - The prototype of functions that compare the referents of two
+ *                                  pointer keys for equality.
+ * @this_key: The first element to compare.
+ * @that_key: The second element to compare.
+ *
+ * If two keys are equal, then both keys must have the same hash code associated with them by
+ * the hasher function defined below.
+ *
+ * Return: true if and only if the referents of the two key pointers are to be treated as the same
+ *         key by the map.
+ */
+typedef bool pointer_key_comparator(const void *this_key, const void *that_key);
+
+/**
+ * typedef pointer_key_hasher - The prototype of functions that get or calculate a hash code
+ *                              associated with the referent of a pointer key.
+ * @key: The pointer key to hash.
+ *
+ * The hash code must be uniformly distributed over all u32 values. The hash code associated
+ * with a given key must not change while the key is in the map. If the comparator function says
+ * two keys are equal, then this function must return the same hash code for both keys. This
+ * function may be called many times for a key while an entry is stored for it in the map.
+ *
+ * Return: The hash code for the key.
+ */
+typedef u32 pointer_key_hasher(const void *key);
+
+int __must_check vdo_make_pointer_map(size_t initial_capacity,
+				      unsigned int initial_load,
+				      pointer_key_comparator comparator,
+				      pointer_key_hasher hasher,
+				      struct pointer_map **map_ptr);
+
+void vdo_free_pointer_map(struct pointer_map *map);
+
+size_t vdo_pointer_map_size(const struct pointer_map *map);
+
+void *vdo_pointer_map_get(struct pointer_map *map, const void *key);
+
+int __must_check vdo_pointer_map_put(struct pointer_map *map,
+				     const void *key,
+				     void *new_value,
+				     bool update,
+				     void **old_value_ptr);
+
+void *vdo_pointer_map_remove(struct pointer_map *map, const void *key);
+
+#endif /* VDO_POINTER_MAP_H */
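
As an aside for reviewers, here is a minimal usage sketch of this interface; it is illustrative
only and not part of the patch. The sample_key type, the hash function, the 0/0 arguments
(assumed here to request the default capacity and load factor), and the VDO_SUCCESS / -EINVAL
error handling are assumptions made for the sketch; real callers supply their own key types,
comparators, and hashers.

#include <linux/errno.h>
#include <linux/types.h>

#include "pointer-map.h"
#include "status-codes.h"

/* Hypothetical key type, used only in this sketch. */
struct sample_key {
	u32 id;
};

/* Keys are equal when their ids match; equal keys must hash identically. */
static bool sample_compare(const void *this_key, const void *that_key)
{
	return ((const struct sample_key *) this_key)->id ==
	       ((const struct sample_key *) that_key)->id;
}

/* Placeholder hash; any function that is uniform over u32 and stable per key will do. */
static u32 sample_hash(const void *key)
{
	return ((const struct sample_key *) key)->id * 2654435761U;
}

/* Insert, look up, and remove a single mapping, then free the (now empty) map. */
static int sample_round_trip(struct sample_key *key, void *value)
{
	struct pointer_map *map;
	void *old_value;
	int result;

	result = vdo_make_pointer_map(0, 0, sample_compare, sample_hash, &map);
	if (result != VDO_SUCCESS)
		return result;

	/* "true" asks the map to replace any existing mapping for this key. */
	result = vdo_pointer_map_put(map, key, value, true, &old_value);
	if (result == VDO_SUCCESS) {
		void *found = vdo_pointer_map_get(map, key);	/* points at value */

		/* The map retained the pointers but never owned them, so nothing is freed. */
		if (vdo_pointer_map_remove(map, key) != found)
			result = -EINVAL;	/* would indicate a bug in this sketch */
	}

	vdo_free_pointer_map(map);
	return result;
}
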
diff --git a/drivers/md/dm-vdo/pool-sysfs-stats.c b/drivers/md/dm-vdo/pool-sysfs-stats.c
new file mode 100644
index 00000000000..411ea5c143a
--- /dev/null
+++ b/drivers/md/dm-vdo/pool-sysfs-stats.c
@@ -0,0 +1,2063 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include <linux/mutex.h>
+
+#include "logger.h"
+#include "string-utils.h"
+
+#include "dedupe.h"
+#include "pool-sysfs.h"
+#include "statistics.h"
+#include "vdo.h"
+
+struct pool_stats_attribute {
+	struct attribute attr;
+	ssize_t (*print)(struct vdo_statistics *stats, char *buf);
+};
+
+static ssize_t pool_stats_attr_show(struct kobject *directory,
+				    struct attribute *attr,
+				    char *buf)
+{
+	ssize_t size;
+	struct pool_stats_attribute *pool_stats_attr =
+		container_of(attr, struct pool_stats_attribute, attr);
+	struct vdo *vdo = container_of(directory, struct vdo, stats_directory);
+
+	if (pool_stats_attr->print == NULL)
+		return -EINVAL;
+
+	mutex_lock(&vdo->stats_mutex);
+	vdo_fetch_statistics(vdo, &vdo->stats_buffer);
+	size = pool_stats_attr->print(&vdo->stats_buffer, buf);
+	mutex_unlock(&vdo->stats_mutex);
+
+	return size;
+}
+
+const struct sysfs_ops vdo_pool_stats_sysfs_ops = {
+	.show = pool_stats_attr_show,
+	.store = NULL,
+};
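
A reviewer aside (illustrative only, not part of the patch): each pool_stats_attribute defined
below becomes visible in sysfs only once its embedded struct attribute has been collected into an
attribute group on a kobject whose kobj_type uses vdo_pool_stats_sysfs_ops. The array, group, and
kobj_type names in this sketch are hypothetical; the actual registration of vdo->stats_directory
is expected to happen elsewhere in this series.

/* Hypothetical wiring; requires <linux/kobject.h> and <linux/sysfs.h>. */
static struct attribute *example_pool_stats_attrs[] = {
	&pool_stats_attr_data_blocks_used.attr,
	&pool_stats_attr_logical_blocks_used.attr,
	/* ...one entry per attribute defined in this file... */
	NULL,
};

static const struct attribute_group example_pool_stats_group = {
	.attrs = example_pool_stats_attrs,
};

static const struct attribute_group *example_pool_stats_groups[] = {
	&example_pool_stats_group,
	NULL,
};

static struct kobj_type example_stats_directory_type = {
	/* A real kobj_type would also supply a .release callback. */
	.sysfs_ops = &vdo_pool_stats_sysfs_ops,
	.default_groups = example_pool_stats_groups,
};

Reading any of the resulting files (for example data_blocks_used) then lands in
pool_stats_attr_show() above, which fetches a consistent snapshot under vdo->stats_mutex before
formatting the requested field.
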
+
+/* Number of blocks used for data */
+static ssize_t
+pool_stats_print_data_blocks_used(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->data_blocks_used);
+}
+
+static struct pool_stats_attribute pool_stats_attr_data_blocks_used = {
+	.attr = { .name = "data_blocks_used", .mode = 0444, },
+	.print = pool_stats_print_data_blocks_used,
+};
+
+/* Number of blocks used for VDO metadata */
+static ssize_t
+pool_stats_print_overhead_blocks_used(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->overhead_blocks_used);
+}
+
+static struct pool_stats_attribute pool_stats_attr_overhead_blocks_used = {
+	.attr = { .name = "overhead_blocks_used", .mode = 0444, },
+	.print = pool_stats_print_overhead_blocks_used,
+};
+
+/* Number of logical blocks that are currently mapped to physical blocks */
+static ssize_t
+pool_stats_print_logical_blocks_used(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->logical_blocks_used);
+}
+
+static struct pool_stats_attribute pool_stats_attr_logical_blocks_used = {
+	.attr = { .name = "logical_blocks_used", .mode = 0444, },
+	.print = pool_stats_print_logical_blocks_used,
+};
+
+/* number of physical blocks */
+static ssize_t
+pool_stats_print_physical_blocks(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->physical_blocks);
+}
+
+static struct pool_stats_attribute pool_stats_attr_physical_blocks = {
+	.attr = { .name = "physical_blocks", .mode = 0444, },
+	.print = pool_stats_print_physical_blocks,
+};
+
+/* number of logical blocks */
+static ssize_t
+pool_stats_print_logical_blocks(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->logical_blocks);
+}
+
+static struct pool_stats_attribute pool_stats_attr_logical_blocks = {
+	.attr = { .name = "logical_blocks", .mode = 0444, },
+	.print = pool_stats_print_logical_blocks,
+};
+
+/* Size of the block map page cache, in bytes */
+static ssize_t
+pool_stats_print_block_map_cache_size(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map_cache_size);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_cache_size = {
+	.attr = { .name = "block_map_cache_size", .mode = 0444, },
+	.print = pool_stats_print_block_map_cache_size,
+};
+
+/* The physical block size */
+static ssize_t
+pool_stats_print_block_size(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_size);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_size = {
+	.attr = { .name = "block_size", .mode = 0444, },
+	.print = pool_stats_print_block_size,
+};
+
+/* Number of times the VDO has successfully recovered */
+static ssize_t
+pool_stats_print_complete_recoveries(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->complete_recoveries);
+}
+
+static struct pool_stats_attribute pool_stats_attr_complete_recoveries = {
+	.attr = { .name = "complete_recoveries", .mode = 0444, },
+	.print = pool_stats_print_complete_recoveries,
+};
+
+/* Number of times the VDO has recovered from read-only mode */
+static ssize_t
+pool_stats_print_read_only_recoveries(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->read_only_recoveries);
+}
+
+static struct pool_stats_attribute pool_stats_attr_read_only_recoveries = {
+	.attr = { .name = "read_only_recoveries", .mode = 0444, },
+	.print = pool_stats_print_read_only_recoveries,
+};
+
+/* String describing the operating mode of the VDO */
+static ssize_t
+pool_stats_print_mode(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%s\n", stats->mode);
+}
+
+static struct pool_stats_attribute pool_stats_attr_mode = {
+	.attr = { .name = "mode", .mode = 0444, },
+	.print = pool_stats_print_mode,
+};
+
+/* Whether the VDO is in recovery mode */
+static ssize_t
+pool_stats_print_in_recovery_mode(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%d\n", stats->in_recovery_mode);
+}
+
+static struct pool_stats_attribute pool_stats_attr_in_recovery_mode = {
+	.attr = { .name = "in_recovery_mode", .mode = 0444, },
+	.print = pool_stats_print_in_recovery_mode,
+};
+
+/* What percentage of recovery mode work has been completed */
+static ssize_t
+pool_stats_print_recovery_percentage(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->recovery_percentage);
+}
+
+static struct pool_stats_attribute pool_stats_attr_recovery_percentage = {
+	.attr = { .name = "recovery_percentage", .mode = 0444, },
+	.print = pool_stats_print_recovery_percentage,
+};
+
+/* Number of compressed data items written since startup */
+static ssize_t
+pool_stats_print_packer_compressed_fragments_written(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->packer.compressed_fragments_written);
+}
+
+static struct pool_stats_attribute pool_stats_attr_packer_compressed_fragments_written = {
+	.attr = { .name = "packer_compressed_fragments_written", .mode = 0444, },
+	.print = pool_stats_print_packer_compressed_fragments_written,
+};
+
+/* Number of blocks containing compressed items written since startup */
+static ssize_t
+pool_stats_print_packer_compressed_blocks_written(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->packer.compressed_blocks_written);
+}
+
+static struct pool_stats_attribute pool_stats_attr_packer_compressed_blocks_written = {
+	.attr = { .name = "packer_compressed_blocks_written", .mode = 0444, },
+	.print = pool_stats_print_packer_compressed_blocks_written,
+};
+
+/* Number of VIOs that are pending in the packer */
+static ssize_t
+pool_stats_print_packer_compressed_fragments_in_packer(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->packer.compressed_fragments_in_packer);
+}
+
+static struct pool_stats_attribute pool_stats_attr_packer_compressed_fragments_in_packer = {
+	.attr = { .name = "packer_compressed_fragments_in_packer", .mode = 0444, },
+	.print = pool_stats_print_packer_compressed_fragments_in_packer,
+};
+
+/* The total number of slabs from which blocks may be allocated */
+static ssize_t
+pool_stats_print_allocator_slab_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->allocator.slab_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_allocator_slab_count = {
+	.attr = { .name = "allocator_slab_count", .mode = 0444, },
+	.print = pool_stats_print_allocator_slab_count,
+};
+
+/* The total number of slabs from which blocks have ever been allocated */
+static ssize_t
+pool_stats_print_allocator_slabs_opened(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->allocator.slabs_opened);
+}
+
+static struct pool_stats_attribute pool_stats_attr_allocator_slabs_opened = {
+	.attr = { .name = "allocator_slabs_opened", .mode = 0444, },
+	.print = pool_stats_print_allocator_slabs_opened,
+};
+
+/* The number of times since loading that a slab has been re-opened */
+static ssize_t
+pool_stats_print_allocator_slabs_reopened(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->allocator.slabs_reopened);
+}
+
+static struct pool_stats_attribute pool_stats_attr_allocator_slabs_reopened = {
+	.attr = { .name = "allocator_slabs_reopened", .mode = 0444, },
+	.print = pool_stats_print_allocator_slabs_reopened,
+};
+
+/* Number of times the on-disk journal was full */
+static ssize_t
+pool_stats_print_journal_disk_full(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->journal.disk_full);
+}
+
+static struct pool_stats_attribute pool_stats_attr_journal_disk_full = {
+	.attr = { .name = "journal_disk_full", .mode = 0444, },
+	.print = pool_stats_print_journal_disk_full,
+};
+
+/* Number of times the recovery journal requested slab journal commits. */
+static ssize_t
+pool_stats_print_journal_slab_journal_commits_requested(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->journal.slab_journal_commits_requested);
+}
+
+static struct pool_stats_attribute pool_stats_attr_journal_slab_journal_commits_requested = {
+	.attr = { .name = "journal_slab_journal_commits_requested", .mode = 0444, },
+	.print = pool_stats_print_journal_slab_journal_commits_requested,
+};
+
+/* The total number of items on which processing has started */
+static ssize_t
+pool_stats_print_journal_entries_started(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->journal.entries.started);
+}
+
+static struct pool_stats_attribute pool_stats_attr_journal_entries_started = {
+	.attr = { .name = "journal_entries_started", .mode = 0444, },
+	.print = pool_stats_print_journal_entries_started,
+};
+
+/* The total number of items for which a write operation has been issued */
+static ssize_t
+pool_stats_print_journal_entries_written(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->journal.entries.written);
+}
+
+static struct pool_stats_attribute pool_stats_attr_journal_entries_written = {
+	.attr = { .name = "journal_entries_written", .mode = 0444, },
+	.print = pool_stats_print_journal_entries_written,
+};
+
+/* The total number of items for which a write operation has completed */
+static ssize_t
+pool_stats_print_journal_entries_committed(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->journal.entries.committed);
+}
+
+static struct pool_stats_attribute pool_stats_attr_journal_entries_committed = {
+	.attr = { .name = "journal_entries_committed", .mode = 0444, },
+	.print = pool_stats_print_journal_entries_committed,
+};
+
+/* The total number of items on which processing has started */
+static ssize_t
+pool_stats_print_journal_blocks_started(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->journal.blocks.started);
+}
+
+static struct pool_stats_attribute pool_stats_attr_journal_blocks_started = {
+	.attr = { .name = "journal_blocks_started", .mode = 0444, },
+	.print = pool_stats_print_journal_blocks_started,
+};
+
+/* The total number of items for which a write operation has been issued */
+static ssize_t
+pool_stats_print_journal_blocks_written(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->journal.blocks.written);
+}
+
+static struct pool_stats_attribute pool_stats_attr_journal_blocks_written = {
+	.attr = { .name = "journal_blocks_written", .mode = 0444, },
+	.print = pool_stats_print_journal_blocks_written,
+};
+
+/* The total number of items for which a write operation has completed */
+static ssize_t
+pool_stats_print_journal_blocks_committed(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->journal.blocks.committed);
+}
+
+static struct pool_stats_attribute pool_stats_attr_journal_blocks_committed = {
+	.attr = { .name = "journal_blocks_committed", .mode = 0444, },
+	.print = pool_stats_print_journal_blocks_committed,
+};
+
+/* Number of times the on-disk journal was full */
+static ssize_t
+pool_stats_print_slab_journal_disk_full_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->slab_journal.disk_full_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_slab_journal_disk_full_count = {
+	.attr = { .name = "slab_journal_disk_full_count", .mode = 0444, },
+	.print = pool_stats_print_slab_journal_disk_full_count,
+};
+
+/* Number of times an entry was added over the flush threshold */
+static ssize_t
+pool_stats_print_slab_journal_flush_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->slab_journal.flush_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_slab_journal_flush_count = {
+	.attr = { .name = "slab_journal_flush_count", .mode = 0444, },
+	.print = pool_stats_print_slab_journal_flush_count,
+};
+
+/* Number of times an entry was added over the block threshold */
+static ssize_t
+pool_stats_print_slab_journal_blocked_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->slab_journal.blocked_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_slab_journal_blocked_count = {
+	.attr = { .name = "slab_journal_blocked_count", .mode = 0444, },
+	.print = pool_stats_print_slab_journal_blocked_count,
+};
+
+/* Number of times a tail block was written */
+static ssize_t
+pool_stats_print_slab_journal_blocks_written(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->slab_journal.blocks_written);
+}
+
+static struct pool_stats_attribute pool_stats_attr_slab_journal_blocks_written = {
+	.attr = { .name = "slab_journal_blocks_written", .mode = 0444, },
+	.print = pool_stats_print_slab_journal_blocks_written,
+};
+
+/* Number of times we had to wait for the tail to write */
+static ssize_t
+pool_stats_print_slab_journal_tail_busy_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->slab_journal.tail_busy_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_slab_journal_tail_busy_count = {
+	.attr = { .name = "slab_journal_tail_busy_count", .mode = 0444, },
+	.print = pool_stats_print_slab_journal_tail_busy_count,
+};
+
+/* Number of blocks written */
+static ssize_t
+pool_stats_print_slab_summary_blocks_written(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->slab_summary.blocks_written);
+}
+
+static struct pool_stats_attribute pool_stats_attr_slab_summary_blocks_written = {
+	.attr = { .name = "slab_summary_blocks_written", .mode = 0444, },
+	.print = pool_stats_print_slab_summary_blocks_written,
+};
+
+/* Number of reference blocks written */
+static ssize_t
+pool_stats_print_ref_counts_blocks_written(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->ref_counts.blocks_written);
+}
+
+static struct pool_stats_attribute pool_stats_attr_ref_counts_blocks_written = {
+	.attr = { .name = "ref_counts_blocks_written", .mode = 0444, },
+	.print = pool_stats_print_ref_counts_blocks_written,
+};
+
+/* number of dirty (resident) pages */
+static ssize_t
+pool_stats_print_block_map_dirty_pages(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->block_map.dirty_pages);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_dirty_pages = {
+	.attr = { .name = "block_map_dirty_pages", .mode = 0444, },
+	.print = pool_stats_print_block_map_dirty_pages,
+};
+
+/* number of clean (resident) pages */
+static ssize_t
+pool_stats_print_block_map_clean_pages(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->block_map.clean_pages);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_clean_pages = {
+	.attr = { .name = "block_map_clean_pages", .mode = 0444, },
+	.print = pool_stats_print_block_map_clean_pages,
+};
+
+/* number of free pages */
+static ssize_t
+pool_stats_print_block_map_free_pages(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->block_map.free_pages);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_free_pages = {
+	.attr = { .name = "block_map_free_pages", .mode = 0444, },
+	.print = pool_stats_print_block_map_free_pages,
+};
+
+/* number of pages in failed state */
+static ssize_t
+pool_stats_print_block_map_failed_pages(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->block_map.failed_pages);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_failed_pages = {
+	.attr = { .name = "block_map_failed_pages", .mode = 0444, },
+	.print = pool_stats_print_block_map_failed_pages,
+};
+
+/* number of pages incoming */
+static ssize_t
+pool_stats_print_block_map_incoming_pages(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->block_map.incoming_pages);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_incoming_pages = {
+	.attr = { .name = "block_map_incoming_pages", .mode = 0444, },
+	.print = pool_stats_print_block_map_incoming_pages,
+};
+
+/* number of pages outgoing */
+static ssize_t
+pool_stats_print_block_map_outgoing_pages(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->block_map.outgoing_pages);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_outgoing_pages = {
+	.attr = { .name = "block_map_outgoing_pages", .mode = 0444, },
+	.print = pool_stats_print_block_map_outgoing_pages,
+};
+
+/* how many times a free page was not available */
+static ssize_t
+pool_stats_print_block_map_cache_pressure(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->block_map.cache_pressure);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_cache_pressure = {
+	.attr = { .name = "block_map_cache_pressure", .mode = 0444, },
+	.print = pool_stats_print_block_map_cache_pressure,
+};
+
+/* number of get_vdo_page() calls for read */
+static ssize_t
+pool_stats_print_block_map_read_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.read_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_read_count = {
+	.attr = { .name = "block_map_read_count", .mode = 0444, },
+	.print = pool_stats_print_block_map_read_count,
+};
+
+/* number of get_vdo_page() calls for write */
+static ssize_t
+pool_stats_print_block_map_write_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.write_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_write_count = {
+	.attr = { .name = "block_map_write_count", .mode = 0444, },
+	.print = pool_stats_print_block_map_write_count,
+};
+
+/* number of times pages failed to read */
+static ssize_t
+pool_stats_print_block_map_failed_reads(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.failed_reads);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_failed_reads = {
+	.attr = { .name = "block_map_failed_reads", .mode = 0444, },
+	.print = pool_stats_print_block_map_failed_reads,
+};
+
+/* number of times pages failed to write */
+static ssize_t
+pool_stats_print_block_map_failed_writes(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.failed_writes);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_failed_writes = {
+	.attr = { .name = "block_map_failed_writes", .mode = 0444, },
+	.print = pool_stats_print_block_map_failed_writes,
+};
+
+/* number of gets that are reclaimed */
+static ssize_t
+pool_stats_print_block_map_reclaimed(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.reclaimed);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_reclaimed = {
+	.attr = { .name = "block_map_reclaimed", .mode = 0444, },
+	.print = pool_stats_print_block_map_reclaimed,
+};
+
+/* number of gets for outgoing pages */
+static ssize_t
+pool_stats_print_block_map_read_outgoing(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.read_outgoing);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_read_outgoing = {
+	.attr = { .name = "block_map_read_outgoing", .mode = 0444, },
+	.print = pool_stats_print_block_map_read_outgoing,
+};
+
+/* number of gets that were already there */
+static ssize_t
+pool_stats_print_block_map_found_in_cache(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.found_in_cache);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_found_in_cache = {
+	.attr = { .name = "block_map_found_in_cache", .mode = 0444, },
+	.print = pool_stats_print_block_map_found_in_cache,
+};
+
+/* number of gets requiring discard */
+static ssize_t
+pool_stats_print_block_map_discard_required(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.discard_required);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_discard_required = {
+	.attr = { .name = "block_map_discard_required", .mode = 0444, },
+	.print = pool_stats_print_block_map_discard_required,
+};
+
+/* number of gets enqueued for their page */
+static ssize_t
+pool_stats_print_block_map_wait_for_page(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.wait_for_page);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_wait_for_page = {
+	.attr = { .name = "block_map_wait_for_page", .mode = 0444, },
+	.print = pool_stats_print_block_map_wait_for_page,
+};
+
+/* number of gets that have to fetch */
+static ssize_t
+pool_stats_print_block_map_fetch_required(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.fetch_required);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_fetch_required = {
+	.attr = { .name = "block_map_fetch_required", .mode = 0444, },
+	.print = pool_stats_print_block_map_fetch_required,
+};
+
+/* number of page fetches */
+static ssize_t
+pool_stats_print_block_map_pages_loaded(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.pages_loaded);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_pages_loaded = {
+	.attr = { .name = "block_map_pages_loaded", .mode = 0444, },
+	.print = pool_stats_print_block_map_pages_loaded,
+};
+
+/* number of page saves */
+static ssize_t
+pool_stats_print_block_map_pages_saved(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.pages_saved);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_pages_saved = {
+	.attr = { .name = "block_map_pages_saved", .mode = 0444, },
+	.print = pool_stats_print_block_map_pages_saved,
+};
+
+/* the number of flushes issued */
+static ssize_t
+pool_stats_print_block_map_flush_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->block_map.flush_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_block_map_flush_count = {
+	.attr = { .name = "block_map_flush_count", .mode = 0444, },
+	.print = pool_stats_print_block_map_flush_count,
+};
+
+/* Number of times the UDS advice proved correct */
+static ssize_t
+pool_stats_print_hash_lock_dedupe_advice_valid(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->hash_lock.dedupe_advice_valid);
+}
+
+static struct pool_stats_attribute pool_stats_attr_hash_lock_dedupe_advice_valid = {
+	.attr = { .name = "hash_lock_dedupe_advice_valid", .mode = 0444, },
+	.print = pool_stats_print_hash_lock_dedupe_advice_valid,
+};
+
+/* Number of times the UDS advice proved incorrect */
+static ssize_t
+pool_stats_print_hash_lock_dedupe_advice_stale(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->hash_lock.dedupe_advice_stale);
+}
+
+static struct pool_stats_attribute pool_stats_attr_hash_lock_dedupe_advice_stale = {
+	.attr = { .name = "hash_lock_dedupe_advice_stale", .mode = 0444, },
+	.print = pool_stats_print_hash_lock_dedupe_advice_stale,
+};
+
+/* Number of writes with the same data as another in-flight write */
+static ssize_t
+pool_stats_print_hash_lock_concurrent_data_matches(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->hash_lock.concurrent_data_matches);
+}
+
+static struct pool_stats_attribute pool_stats_attr_hash_lock_concurrent_data_matches = {
+	.attr = { .name = "hash_lock_concurrent_data_matches", .mode = 0444, },
+	.print = pool_stats_print_hash_lock_concurrent_data_matches,
+};
+
+/* Number of writes whose hash collided with an in-flight write */
+static ssize_t
+pool_stats_print_hash_lock_concurrent_hash_collisions(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->hash_lock.concurrent_hash_collisions);
+}
+
+static struct pool_stats_attribute pool_stats_attr_hash_lock_concurrent_hash_collisions = {
+	.attr = { .name = "hash_lock_concurrent_hash_collisions", .mode = 0444, },
+	.print = pool_stats_print_hash_lock_concurrent_hash_collisions,
+};
+
+/* Current number of dedupe queries that are in flight */
+static ssize_t
+pool_stats_print_hash_lock_curr_dedupe_queries(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->hash_lock.curr_dedupe_queries);
+}
+
+static struct pool_stats_attribute pool_stats_attr_hash_lock_curr_dedupe_queries = {
+	.attr = { .name = "hash_lock_curr_dedupe_queries", .mode = 0444, },
+	.print = pool_stats_print_hash_lock_curr_dedupe_queries,
+};
+
+/* number of times VDO got an invalid dedupe advice PBN from UDS */
+static ssize_t
+pool_stats_print_errors_invalid_advice_pbn_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->errors.invalid_advice_pbn_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_errors_invalid_advice_pbn_count = {
+	.attr = { .name = "errors_invalid_advice_pbn_count", .mode = 0444, },
+	.print = pool_stats_print_errors_invalid_advice_pbn_count,
+};
+
+/* number of times a VIO completed with a VDO_NO_SPACE error */
+static ssize_t
+pool_stats_print_errors_no_space_error_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->errors.no_space_error_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_errors_no_space_error_count = {
+	.attr = { .name = "errors_no_space_error_count", .mode = 0444, },
+	.print = pool_stats_print_errors_no_space_error_count,
+};
+
+/* number of times a VIO completed with a VDO_READ_ONLY error */
+static ssize_t
+pool_stats_print_errors_read_only_error_count(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->errors.read_only_error_count);
+}
+
+static struct pool_stats_attribute pool_stats_attr_errors_read_only_error_count = {
+	.attr = { .name = "errors_read_only_error_count", .mode = 0444, },
+	.print = pool_stats_print_errors_read_only_error_count,
+};
+
+/* The VDO instance */
+static ssize_t
+pool_stats_print_instance(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->instance);
+}
+
+static struct pool_stats_attribute pool_stats_attr_instance = {
+	.attr = { .name = "instance", .mode = 0444, },
+	.print = pool_stats_print_instance,
+};
+
+/* Current number of active VIOs */
+static ssize_t
+pool_stats_print_current_vios_in_progress(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->current_vios_in_progress);
+}
+
+static struct pool_stats_attribute pool_stats_attr_current_vios_in_progress = {
+	.attr = { .name = "current_vios_in_progress", .mode = 0444, },
+	.print = pool_stats_print_current_vios_in_progress,
+};
+
+/* Maximum number of active VIOs */
+static ssize_t
+pool_stats_print_max_vios(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%u\n", stats->max_vios);
+}
+
+static struct pool_stats_attribute pool_stats_attr_max_vios = {
+	.attr = { .name = "max_vios", .mode = 0444, },
+	.print = pool_stats_print_max_vios,
+};
+
+/* Number of times the UDS index was too slow in responding */
+static ssize_t
+pool_stats_print_dedupe_advice_timeouts(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->dedupe_advice_timeouts);
+}
+
+static struct pool_stats_attribute pool_stats_attr_dedupe_advice_timeouts = {
+	.attr = { .name = "dedupe_advice_timeouts", .mode = 0444, },
+	.print = pool_stats_print_dedupe_advice_timeouts,
+};
+
+/* Number of flush requests submitted to the storage device */
+static ssize_t
+pool_stats_print_flush_out(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->flush_out);
+}
+
+static struct pool_stats_attribute pool_stats_attr_flush_out = {
+	.attr = { .name = "flush_out", .mode = 0444, },
+	.print = pool_stats_print_flush_out,
+};
+
+/* Logical block size */
+static ssize_t
+pool_stats_print_logical_block_size(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->logical_block_size);
+}
+
+static struct pool_stats_attribute pool_stats_attr_logical_block_size = {
+	.attr = { .name = "logical_block_size", .mode = 0444, },
+	.print = pool_stats_print_logical_block_size,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_in_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_read = {
+	.attr = { .name = "bios_in_read", .mode = 0444, },
+	.print = pool_stats_print_bios_in_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_in_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_write = {
+	.attr = { .name = "bios_in_write", .mode = 0444, },
+	.print = pool_stats_print_bios_in_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_in_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_empty_flush = {
+	.attr = { .name = "bios_in_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_in_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_in_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_discard = {
+	.attr = { .name = "bios_in_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_in_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_in_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_flush = {
+	.attr = { .name = "bios_in_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_in_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_in_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_fua = {
+	.attr = { .name = "bios_in_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_in_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_in_partial_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_partial.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_partial_read = {
+	.attr = { .name = "bios_in_partial_read", .mode = 0444, },
+	.print = pool_stats_print_bios_in_partial_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_in_partial_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_partial.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_partial_write = {
+	.attr = { .name = "bios_in_partial_write", .mode = 0444, },
+	.print = pool_stats_print_bios_in_partial_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_in_partial_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_partial.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_partial_empty_flush = {
+	.attr = { .name = "bios_in_partial_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_in_partial_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_in_partial_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_partial.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_partial_discard = {
+	.attr = { .name = "bios_in_partial_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_in_partial_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_in_partial_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_partial.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_partial_flush = {
+	.attr = { .name = "bios_in_partial_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_in_partial_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_in_partial_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_partial.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_partial_fua = {
+	.attr = { .name = "bios_in_partial_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_in_partial_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_out_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_read = {
+	.attr = { .name = "bios_out_read", .mode = 0444, },
+	.print = pool_stats_print_bios_out_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_out_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_write = {
+	.attr = { .name = "bios_out_write", .mode = 0444, },
+	.print = pool_stats_print_bios_out_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_out_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_empty_flush = {
+	.attr = { .name = "bios_out_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_out_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_out_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_discard = {
+	.attr = { .name = "bios_out_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_out_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_out_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_flush = {
+	.attr = { .name = "bios_out_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_out_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_out_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_fua = {
+	.attr = { .name = "bios_out_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_out_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_meta_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_read = {
+	.attr = { .name = "bios_meta_read", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_meta_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_write = {
+	.attr = { .name = "bios_meta_write", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_meta_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_empty_flush = {
+	.attr = { .name = "bios_meta_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_meta_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_discard = {
+	.attr = { .name = "bios_meta_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_meta_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_flush = {
+	.attr = { .name = "bios_meta_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_meta_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_fua = {
+	.attr = { .name = "bios_meta_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_journal_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_read = {
+	.attr = { .name = "bios_journal_read", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_journal_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_write = {
+	.attr = { .name = "bios_journal_write", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_journal_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_empty_flush = {
+	.attr = { .name = "bios_journal_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_journal_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_discard = {
+	.attr = { .name = "bios_journal_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_journal_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_flush = {
+	.attr = { .name = "bios_journal_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_journal_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_fua = {
+	.attr = { .name = "bios_journal_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_page_cache_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_read = {
+	.attr = { .name = "bios_page_cache_read", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_page_cache_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_write = {
+	.attr = { .name = "bios_page_cache_write", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_page_cache_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_empty_flush = {
+	.attr = { .name = "bios_page_cache_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_page_cache_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_discard = {
+	.attr = { .name = "bios_page_cache_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_page_cache_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_flush = {
+	.attr = { .name = "bios_page_cache_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_page_cache_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_fua = {
+	.attr = { .name = "bios_page_cache_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_out_completed_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out_completed.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_completed_read = {
+	.attr = { .name = "bios_out_completed_read", .mode = 0444, },
+	.print = pool_stats_print_bios_out_completed_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_out_completed_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out_completed.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_completed_write = {
+	.attr = { .name = "bios_out_completed_write", .mode = 0444, },
+	.print = pool_stats_print_bios_out_completed_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_out_completed_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out_completed.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_completed_empty_flush = {
+	.attr = { .name = "bios_out_completed_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_out_completed_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_out_completed_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out_completed.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_completed_discard = {
+	.attr = { .name = "bios_out_completed_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_out_completed_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_out_completed_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out_completed.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_completed_flush = {
+	.attr = { .name = "bios_out_completed_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_out_completed_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_out_completed_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_out_completed.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_out_completed_fua = {
+	.attr = { .name = "bios_out_completed_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_out_completed_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_meta_completed_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta_completed.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_read = {
+	.attr = { .name = "bios_meta_completed_read", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_completed_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_meta_completed_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta_completed.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_write = {
+	.attr = { .name = "bios_meta_completed_write", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_completed_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_meta_completed_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta_completed.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_empty_flush = {
+	.attr = { .name = "bios_meta_completed_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_completed_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_meta_completed_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta_completed.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_discard = {
+	.attr = { .name = "bios_meta_completed_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_completed_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_meta_completed_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta_completed.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_flush = {
+	.attr = { .name = "bios_meta_completed_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_completed_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_meta_completed_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_meta_completed.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_fua = {
+	.attr = { .name = "bios_meta_completed_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_meta_completed_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_journal_completed_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal_completed.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_read = {
+	.attr = { .name = "bios_journal_completed_read", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_completed_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_journal_completed_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal_completed.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_write = {
+	.attr = { .name = "bios_journal_completed_write", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_completed_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_journal_completed_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal_completed.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_empty_flush = {
+	.attr = { .name = "bios_journal_completed_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_completed_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_journal_completed_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal_completed.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_discard = {
+	.attr = { .name = "bios_journal_completed_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_completed_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_journal_completed_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal_completed.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_flush = {
+	.attr = { .name = "bios_journal_completed_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_completed_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_journal_completed_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_journal_completed.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_fua = {
+	.attr = { .name = "bios_journal_completed_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_journal_completed_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_page_cache_completed_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_read = {
+	.attr = { .name = "bios_page_cache_completed_read", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_completed_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_page_cache_completed_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_write = {
+	.attr = { .name = "bios_page_cache_completed_write", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_completed_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_page_cache_completed_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_empty_flush = {
+	.attr = { .name = "bios_page_cache_completed_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_completed_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_page_cache_completed_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_discard = {
+	.attr = { .name = "bios_page_cache_completed_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_completed_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_page_cache_completed_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_flush = {
+	.attr = { .name = "bios_page_cache_completed_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_completed_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_page_cache_completed_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_page_cache_completed.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_fua = {
+	.attr = { .name = "bios_page_cache_completed_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_page_cache_completed_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_acknowledged_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_read = {
+	.attr = { .name = "bios_acknowledged_read", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_acknowledged_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_write = {
+	.attr = { .name = "bios_acknowledged_write", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_acknowledged_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_empty_flush = {
+	.attr = { .name = "bios_acknowledged_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_acknowledged_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_discard = {
+	.attr = { .name = "bios_acknowledged_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_acknowledged_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_flush = {
+	.attr = { .name = "bios_acknowledged_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_acknowledged_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_fua = {
+	.attr = { .name = "bios_acknowledged_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_acknowledged_partial_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_read = {
+	.attr = { .name = "bios_acknowledged_partial_read", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_partial_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_acknowledged_partial_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_write = {
+	.attr = { .name = "bios_acknowledged_partial_write", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_partial_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_acknowledged_partial_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_empty_flush = {
+	.attr = { .name = "bios_acknowledged_partial_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_partial_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_acknowledged_partial_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_discard = {
+	.attr = { .name = "bios_acknowledged_partial_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_partial_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_acknowledged_partial_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_flush = {
+	.attr = { .name = "bios_acknowledged_partial_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_partial_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_acknowledged_partial_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_acknowledged_partial.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_fua = {
+	.attr = { .name = "bios_acknowledged_partial_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_acknowledged_partial_fua,
+};
+
+/* Number of REQ_OP_READ bios */
+static ssize_t
+pool_stats_print_bios_in_progress_read(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_progress.read);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_progress_read = {
+	.attr = { .name = "bios_in_progress_read", .mode = 0444, },
+	.print = pool_stats_print_bios_in_progress_read,
+};
+
+/* Number of REQ_OP_WRITE bios with data */
+static ssize_t
+pool_stats_print_bios_in_progress_write(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_progress.write);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_progress_write = {
+	.attr = { .name = "bios_in_progress_write", .mode = 0444, },
+	.print = pool_stats_print_bios_in_progress_write,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH and containing no data */
+static ssize_t
+pool_stats_print_bios_in_progress_empty_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_progress.empty_flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_progress_empty_flush = {
+	.attr = { .name = "bios_in_progress_empty_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_in_progress_empty_flush,
+};
+
+/* Number of REQ_OP_DISCARD bios */
+static ssize_t
+pool_stats_print_bios_in_progress_discard(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_progress.discard);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_progress_discard = {
+	.attr = { .name = "bios_in_progress_discard", .mode = 0444, },
+	.print = pool_stats_print_bios_in_progress_discard,
+};
+
+/* Number of bios tagged with REQ_PREFLUSH */
+static ssize_t
+pool_stats_print_bios_in_progress_flush(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_progress.flush);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_progress_flush = {
+	.attr = { .name = "bios_in_progress_flush", .mode = 0444, },
+	.print = pool_stats_print_bios_in_progress_flush,
+};
+
+/* Number of bios tagged with REQ_FUA */
+static ssize_t
+pool_stats_print_bios_in_progress_fua(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->bios_in_progress.fua);
+}
+
+static struct pool_stats_attribute pool_stats_attr_bios_in_progress_fua = {
+	.attr = { .name = "bios_in_progress_fua", .mode = 0444, },
+	.print = pool_stats_print_bios_in_progress_fua,
+};
+
+/* Tracked bytes currently allocated. */
+static ssize_t
+pool_stats_print_memory_usage_bytes_used(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->memory_usage.bytes_used);
+}
+
+static struct pool_stats_attribute pool_stats_attr_memory_usage_bytes_used = {
+	.attr = { .name = "memory_usage_bytes_used", .mode = 0444, },
+	.print = pool_stats_print_memory_usage_bytes_used,
+};
+
+/* Maximum tracked bytes allocated. */
+static ssize_t
+pool_stats_print_memory_usage_peak_bytes_used(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->memory_usage.peak_bytes_used);
+}
+
+static struct pool_stats_attribute pool_stats_attr_memory_usage_peak_bytes_used = {
+	.attr = { .name = "memory_usage_peak_bytes_used", .mode = 0444, },
+	.print = pool_stats_print_memory_usage_peak_bytes_used,
+};
+
+/* Number of records stored in the index */
+static ssize_t
+pool_stats_print_index_entries_indexed(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->index.entries_indexed);
+}
+
+static struct pool_stats_attribute pool_stats_attr_index_entries_indexed = {
+	.attr = { .name = "index_entries_indexed", .mode = 0444, },
+	.print = pool_stats_print_index_entries_indexed,
+};
+
+/* Number of post calls that found an existing entry */
+static ssize_t
+pool_stats_print_index_posts_found(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->index.posts_found);
+}
+
+static struct pool_stats_attribute pool_stats_attr_index_posts_found = {
+	.attr = { .name = "index_posts_found", .mode = 0444, },
+	.print = pool_stats_print_index_posts_found,
+};
+
+/* Number of post calls that added a new entry */
+static ssize_t
+pool_stats_print_index_posts_not_found(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->index.posts_not_found);
+}
+
+static struct pool_stats_attribute pool_stats_attr_index_posts_not_found = {
+	.attr = { .name = "index_posts_not_found", .mode = 0444, },
+	.print = pool_stats_print_index_posts_not_found,
+};
+
+/* Number of query calls that found an existing entry */
+static ssize_t
+pool_stats_print_index_queries_found(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->index.queries_found);
+}
+
+static struct pool_stats_attribute pool_stats_attr_index_queries_found = {
+	.attr = { .name = "index_queries_found", .mode = 0444, },
+	.print = pool_stats_print_index_queries_found,
+};
+
+/* Number of query calls that added a new entry */
+static ssize_t
+pool_stats_print_index_queries_not_found(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->index.queries_not_found);
+}
+
+static struct pool_stats_attribute pool_stats_attr_index_queries_not_found = {
+	.attr = { .name = "index_queries_not_found", .mode = 0444, },
+	.print = pool_stats_print_index_queries_not_found,
+};
+
+/* Number of update calls that found an existing entry */
+static ssize_t
+pool_stats_print_index_updates_found(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->index.updates_found);
+}
+
+static struct pool_stats_attribute pool_stats_attr_index_updates_found = {
+	.attr = { .name = "index_updates_found", .mode = 0444, },
+	.print = pool_stats_print_index_updates_found,
+};
+
+/* Number of update calls that added a new entry */
+static ssize_t
+pool_stats_print_index_updates_not_found(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->index.updates_not_found);
+}
+
+static struct pool_stats_attribute pool_stats_attr_index_updates_not_found = {
+	.attr = { .name = "index_updates_not_found", .mode = 0444, },
+	.print = pool_stats_print_index_updates_not_found,
+};
+
+/* Number of entries discarded */
+static ssize_t
+pool_stats_print_index_entries_discarded(struct vdo_statistics *stats, char *buf)
+{
+	return sprintf(buf, "%llu\n", stats->index.entries_discarded);
+}
+
+static struct pool_stats_attribute pool_stats_attr_index_entries_discarded = {
+	.attr = { .name = "index_entries_discarded", .mode = 0444, },
+	.print = pool_stats_print_index_entries_discarded,
+};
+
+struct attribute *vdo_pool_stats_attrs[] = {
+	&pool_stats_attr_data_blocks_used.attr,
+	&pool_stats_attr_overhead_blocks_used.attr,
+	&pool_stats_attr_logical_blocks_used.attr,
+	&pool_stats_attr_physical_blocks.attr,
+	&pool_stats_attr_logical_blocks.attr,
+	&pool_stats_attr_block_map_cache_size.attr,
+	&pool_stats_attr_block_size.attr,
+	&pool_stats_attr_complete_recoveries.attr,
+	&pool_stats_attr_read_only_recoveries.attr,
+	&pool_stats_attr_mode.attr,
+	&pool_stats_attr_in_recovery_mode.attr,
+	&pool_stats_attr_recovery_percentage.attr,
+	&pool_stats_attr_packer_compressed_fragments_written.attr,
+	&pool_stats_attr_packer_compressed_blocks_written.attr,
+	&pool_stats_attr_packer_compressed_fragments_in_packer.attr,
+	&pool_stats_attr_allocator_slab_count.attr,
+	&pool_stats_attr_allocator_slabs_opened.attr,
+	&pool_stats_attr_allocator_slabs_reopened.attr,
+	&pool_stats_attr_journal_disk_full.attr,
+	&pool_stats_attr_journal_slab_journal_commits_requested.attr,
+	&pool_stats_attr_journal_entries_started.attr,
+	&pool_stats_attr_journal_entries_written.attr,
+	&pool_stats_attr_journal_entries_committed.attr,
+	&pool_stats_attr_journal_blocks_started.attr,
+	&pool_stats_attr_journal_blocks_written.attr,
+	&pool_stats_attr_journal_blocks_committed.attr,
+	&pool_stats_attr_slab_journal_disk_full_count.attr,
+	&pool_stats_attr_slab_journal_flush_count.attr,
+	&pool_stats_attr_slab_journal_blocked_count.attr,
+	&pool_stats_attr_slab_journal_blocks_written.attr,
+	&pool_stats_attr_slab_journal_tail_busy_count.attr,
+	&pool_stats_attr_slab_summary_blocks_written.attr,
+	&pool_stats_attr_ref_counts_blocks_written.attr,
+	&pool_stats_attr_block_map_dirty_pages.attr,
+	&pool_stats_attr_block_map_clean_pages.attr,
+	&pool_stats_attr_block_map_free_pages.attr,
+	&pool_stats_attr_block_map_failed_pages.attr,
+	&pool_stats_attr_block_map_incoming_pages.attr,
+	&pool_stats_attr_block_map_outgoing_pages.attr,
+	&pool_stats_attr_block_map_cache_pressure.attr,
+	&pool_stats_attr_block_map_read_count.attr,
+	&pool_stats_attr_block_map_write_count.attr,
+	&pool_stats_attr_block_map_failed_reads.attr,
+	&pool_stats_attr_block_map_failed_writes.attr,
+	&pool_stats_attr_block_map_reclaimed.attr,
+	&pool_stats_attr_block_map_read_outgoing.attr,
+	&pool_stats_attr_block_map_found_in_cache.attr,
+	&pool_stats_attr_block_map_discard_required.attr,
+	&pool_stats_attr_block_map_wait_for_page.attr,
+	&pool_stats_attr_block_map_fetch_required.attr,
+	&pool_stats_attr_block_map_pages_loaded.attr,
+	&pool_stats_attr_block_map_pages_saved.attr,
+	&pool_stats_attr_block_map_flush_count.attr,
+	&pool_stats_attr_hash_lock_dedupe_advice_valid.attr,
+	&pool_stats_attr_hash_lock_dedupe_advice_stale.attr,
+	&pool_stats_attr_hash_lock_concurrent_data_matches.attr,
+	&pool_stats_attr_hash_lock_concurrent_hash_collisions.attr,
+	&pool_stats_attr_hash_lock_curr_dedupe_queries.attr,
+	&pool_stats_attr_errors_invalid_advice_pbn_count.attr,
+	&pool_stats_attr_errors_no_space_error_count.attr,
+	&pool_stats_attr_errors_read_only_error_count.attr,
+	&pool_stats_attr_instance.attr,
+	&pool_stats_attr_current_vios_in_progress.attr,
+	&pool_stats_attr_max_vios.attr,
+	&pool_stats_attr_dedupe_advice_timeouts.attr,
+	&pool_stats_attr_flush_out.attr,
+	&pool_stats_attr_logical_block_size.attr,
+	&pool_stats_attr_bios_in_read.attr,
+	&pool_stats_attr_bios_in_write.attr,
+	&pool_stats_attr_bios_in_empty_flush.attr,
+	&pool_stats_attr_bios_in_discard.attr,
+	&pool_stats_attr_bios_in_flush.attr,
+	&pool_stats_attr_bios_in_fua.attr,
+	&pool_stats_attr_bios_in_partial_read.attr,
+	&pool_stats_attr_bios_in_partial_write.attr,
+	&pool_stats_attr_bios_in_partial_empty_flush.attr,
+	&pool_stats_attr_bios_in_partial_discard.attr,
+	&pool_stats_attr_bios_in_partial_flush.attr,
+	&pool_stats_attr_bios_in_partial_fua.attr,
+	&pool_stats_attr_bios_out_read.attr,
+	&pool_stats_attr_bios_out_write.attr,
+	&pool_stats_attr_bios_out_empty_flush.attr,
+	&pool_stats_attr_bios_out_discard.attr,
+	&pool_stats_attr_bios_out_flush.attr,
+	&pool_stats_attr_bios_out_fua.attr,
+	&pool_stats_attr_bios_meta_read.attr,
+	&pool_stats_attr_bios_meta_write.attr,
+	&pool_stats_attr_bios_meta_empty_flush.attr,
+	&pool_stats_attr_bios_meta_discard.attr,
+	&pool_stats_attr_bios_meta_flush.attr,
+	&pool_stats_attr_bios_meta_fua.attr,
+	&pool_stats_attr_bios_journal_read.attr,
+	&pool_stats_attr_bios_journal_write.attr,
+	&pool_stats_attr_bios_journal_empty_flush.attr,
+	&pool_stats_attr_bios_journal_discard.attr,
+	&pool_stats_attr_bios_journal_flush.attr,
+	&pool_stats_attr_bios_journal_fua.attr,
+	&pool_stats_attr_bios_page_cache_read.attr,
+	&pool_stats_attr_bios_page_cache_write.attr,
+	&pool_stats_attr_bios_page_cache_empty_flush.attr,
+	&pool_stats_attr_bios_page_cache_discard.attr,
+	&pool_stats_attr_bios_page_cache_flush.attr,
+	&pool_stats_attr_bios_page_cache_fua.attr,
+	&pool_stats_attr_bios_out_completed_read.attr,
+	&pool_stats_attr_bios_out_completed_write.attr,
+	&pool_stats_attr_bios_out_completed_empty_flush.attr,
+	&pool_stats_attr_bios_out_completed_discard.attr,
+	&pool_stats_attr_bios_out_completed_flush.attr,
+	&pool_stats_attr_bios_out_completed_fua.attr,
+	&pool_stats_attr_bios_meta_completed_read.attr,
+	&pool_stats_attr_bios_meta_completed_write.attr,
+	&pool_stats_attr_bios_meta_completed_empty_flush.attr,
+	&pool_stats_attr_bios_meta_completed_discard.attr,
+	&pool_stats_attr_bios_meta_completed_flush.attr,
+	&pool_stats_attr_bios_meta_completed_fua.attr,
+	&pool_stats_attr_bios_journal_completed_read.attr,
+	&pool_stats_attr_bios_journal_completed_write.attr,
+	&pool_stats_attr_bios_journal_completed_empty_flush.attr,
+	&pool_stats_attr_bios_journal_completed_discard.attr,
+	&pool_stats_attr_bios_journal_completed_flush.attr,
+	&pool_stats_attr_bios_journal_completed_fua.attr,
+	&pool_stats_attr_bios_page_cache_completed_read.attr,
+	&pool_stats_attr_bios_page_cache_completed_write.attr,
+	&pool_stats_attr_bios_page_cache_completed_empty_flush.attr,
+	&pool_stats_attr_bios_page_cache_completed_discard.attr,
+	&pool_stats_attr_bios_page_cache_completed_flush.attr,
+	&pool_stats_attr_bios_page_cache_completed_fua.attr,
+	&pool_stats_attr_bios_acknowledged_read.attr,
+	&pool_stats_attr_bios_acknowledged_write.attr,
+	&pool_stats_attr_bios_acknowledged_empty_flush.attr,
+	&pool_stats_attr_bios_acknowledged_discard.attr,
+	&pool_stats_attr_bios_acknowledged_flush.attr,
+	&pool_stats_attr_bios_acknowledged_fua.attr,
+	&pool_stats_attr_bios_acknowledged_partial_read.attr,
+	&pool_stats_attr_bios_acknowledged_partial_write.attr,
+	&pool_stats_attr_bios_acknowledged_partial_empty_flush.attr,
+	&pool_stats_attr_bios_acknowledged_partial_discard.attr,
+	&pool_stats_attr_bios_acknowledged_partial_flush.attr,
+	&pool_stats_attr_bios_acknowledged_partial_fua.attr,
+	&pool_stats_attr_bios_in_progress_read.attr,
+	&pool_stats_attr_bios_in_progress_write.attr,
+	&pool_stats_attr_bios_in_progress_empty_flush.attr,
+	&pool_stats_attr_bios_in_progress_discard.attr,
+	&pool_stats_attr_bios_in_progress_flush.attr,
+	&pool_stats_attr_bios_in_progress_fua.attr,
+	&pool_stats_attr_memory_usage_bytes_used.attr,
+	&pool_stats_attr_memory_usage_peak_bytes_used.attr,
+	&pool_stats_attr_index_entries_indexed.attr,
+	&pool_stats_attr_index_posts_found.attr,
+	&pool_stats_attr_index_posts_not_found.attr,
+	&pool_stats_attr_index_queries_found.attr,
+	&pool_stats_attr_index_queries_not_found.attr,
+	&pool_stats_attr_index_updates_found.attr,
+	&pool_stats_attr_index_updates_not_found.attr,
+	&pool_stats_attr_index_entries_discarded.attr,
+	NULL,
+};
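+
+/*
+ * Note (illustrative): each attribute above becomes a read-only file in the pool's "statistics"
+ * sysfs subdirectory (see pool-sysfs.h), so, for example, the current value of data_blocks_used
+ * could be read with something like `cat <pool sysfs dir>/statistics/data_blocks_used`; the
+ * exact path depends on where the statistics kobject is registered.
+ */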
diff --git a/drivers/md/dm-vdo/pool-sysfs.c b/drivers/md/dm-vdo/pool-sysfs.c
new file mode 100644
index 00000000000..73006e5859d
--- /dev/null
+++ b/drivers/md/dm-vdo/pool-sysfs.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "pool-sysfs.h"
+
+#include <linux/kstrtox.h>
+
+#include "memory-alloc.h"
+#include "string-utils.h"
+
+#include "data-vio.h"
+#include "dedupe.h"
+#include "vdo.h"
+
+struct pool_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct vdo *vdo, char *buf);
+	ssize_t (*store)(struct vdo *vdo, const char *value, size_t count);
+};
+
+static ssize_t vdo_pool_attr_show(struct kobject *directory, struct attribute *attr, char *buf)
+{
+	struct pool_attribute *pool_attr = container_of(attr, struct pool_attribute, attr);
+	struct vdo *vdo = container_of(directory, struct vdo, vdo_directory);
+
+	if (pool_attr->show == NULL)
+		return -EINVAL;
+	return pool_attr->show(vdo, buf);
+}
+
+static ssize_t vdo_pool_attr_store(struct kobject *directory,
+				   struct attribute *attr,
+				   const char *buf,
+				   size_t length)
+{
+	struct pool_attribute *pool_attr = container_of(attr, struct pool_attribute, attr);
+	struct vdo *vdo = container_of(directory, struct vdo, vdo_directory);
+
+	if (pool_attr->store == NULL)
+		return -EINVAL;
+	return pool_attr->store(vdo, buf, length);
+}
+
+static const struct sysfs_ops vdo_pool_sysfs_ops = {
+	.show = vdo_pool_attr_show,
+	.store = vdo_pool_attr_store,
+};
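+
+/*
+ * Illustrative flow (not new behavior): a userspace read of one of the pool attributes defined
+ * below reaches vdo_pool_attr_show(), which uses container_of() to recover both the
+ * pool_attribute and the owning vdo, then dispatches to that attribute's show method. For
+ * example, reading "compressing" ends up in pool_compressing_show(), while a write to
+ * "discards_limit" is routed through vdo_pool_attr_store() to pool_discards_limit_store().
+ */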
+
+static ssize_t pool_compressing_show(struct vdo *vdo, char *buf)
+{
+	return sprintf(buf, "%s\n", (vdo_get_compressing(vdo) ? "1" : "0"));
+}
+
+static ssize_t pool_discards_active_show(struct vdo *vdo, char *buf)
+{
+	return sprintf(buf, "%u\n", get_data_vio_pool_active_discards(vdo->data_vio_pool));
+}
+
+static ssize_t pool_discards_limit_show(struct vdo *vdo, char *buf)
+{
+	return sprintf(buf, "%u\n", get_data_vio_pool_discard_limit(vdo->data_vio_pool));
+}
+
+static ssize_t pool_discards_limit_store(struct vdo *vdo, const char *buf, size_t length)
+{
+	unsigned int value;
+	int result;
+
+	if ((length > 12) || (kstrtouint(buf, 10, &value) < 0) || (value < 1))
+		return -EINVAL;
+
+	result = set_data_vio_pool_discard_limit(vdo->data_vio_pool, value);
+	if (result != VDO_SUCCESS)
+		return -EINVAL;
+
+	return length;
+}
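+
+/*
+ * Usage sketch (illustrative only; the exact sysfs path depends on where the vdo's kobject is
+ * registered): the discard limit can be read and tuned from userspace, e.g.
+ *
+ *	cat <pool sysfs dir>/discards_limit
+ *	echo 3000 > <pool sysfs dir>/discards_limit
+ *
+ * Writes longer than 12 characters, non-numeric strings, or values below 1 are rejected with
+ * -EINVAL by the store method above.
+ */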
+
+static ssize_t pool_discards_maximum_show(struct vdo *vdo, char *buf)
+{
+	return sprintf(buf, "%u\n", get_data_vio_pool_maximum_discards(vdo->data_vio_pool));
+}
+
+static ssize_t pool_instance_show(struct vdo *vdo, char *buf)
+{
+	return sprintf(buf, "%u\n", vdo->instance);
+}
+
+static ssize_t pool_requests_active_show(struct vdo *vdo, char *buf)
+{
+	return sprintf(buf, "%u\n", get_data_vio_pool_active_requests(vdo->data_vio_pool));
+}
+
+static ssize_t pool_requests_limit_show(struct vdo *vdo, char *buf)
+{
+	return sprintf(buf, "%u\n", get_data_vio_pool_request_limit(vdo->data_vio_pool));
+}
+
+static ssize_t pool_requests_maximum_show(struct vdo *vdo, char *buf)
+{
+	return sprintf(buf, "%u\n", get_data_vio_pool_maximum_requests(vdo->data_vio_pool));
+}
+
+static void vdo_pool_release(struct kobject *directory)
+{
+	UDS_FREE(container_of(directory, struct vdo, vdo_directory));
+}
+
+static struct pool_attribute vdo_pool_compressing_attr = {
+	.attr = {
+			.name = "compressing",
+			.mode = 0444,
+		},
+	.show = pool_compressing_show,
+};
+
+static struct pool_attribute vdo_pool_discards_active_attr = {
+	.attr = {
+			.name = "discards_active",
+			.mode = 0444,
+		},
+	.show = pool_discards_active_show,
+};
+
+static struct pool_attribute vdo_pool_discards_limit_attr = {
+	.attr = {
+			.name = "discards_limit",
+			.mode = 0644,
+		},
+	.show = pool_discards_limit_show,
+	.store = pool_discards_limit_store,
+};
+
+static struct pool_attribute vdo_pool_discards_maximum_attr = {
+	.attr = {
+			.name = "discards_maximum",
+			.mode = 0444,
+		},
+	.show = pool_discards_maximum_show,
+};
+
+static struct pool_attribute vdo_pool_instance_attr = {
+	.attr = {
+			.name = "instance",
+			.mode = 0444,
+		},
+	.show = pool_instance_show,
+};
+
+static struct pool_attribute vdo_pool_requests_active_attr = {
+	.attr = {
+			.name = "requests_active",
+			.mode = 0444,
+		},
+	.show = pool_requests_active_show,
+};
+
+static struct pool_attribute vdo_pool_requests_limit_attr = {
+	.attr = {
+			.name = "requests_limit",
+			.mode = 0444,
+		},
+	.show = pool_requests_limit_show,
+};
+
+static struct pool_attribute vdo_pool_requests_maximum_attr = {
+	.attr = {
+			.name = "requests_maximum",
+			.mode = 0444,
+		},
+	.show = pool_requests_maximum_show,
+};
+
+static struct attribute *pool_attrs[] = {
+	&vdo_pool_compressing_attr.attr,
+	&vdo_pool_discards_active_attr.attr,
+	&vdo_pool_discards_limit_attr.attr,
+	&vdo_pool_discards_maximum_attr.attr,
+	&vdo_pool_instance_attr.attr,
+	&vdo_pool_requests_active_attr.attr,
+	&vdo_pool_requests_limit_attr.attr,
+	&vdo_pool_requests_maximum_attr.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(pool);
+
+struct kobj_type vdo_directory_type = {
+	.release = vdo_pool_release,
+	.sysfs_ops = &vdo_pool_sysfs_ops,
+	.default_groups = pool_groups,
+};
diff --git a/drivers/md/dm-vdo/pool-sysfs.h b/drivers/md/dm-vdo/pool-sysfs.h
new file mode 100644
index 00000000000..1e8a172c367
--- /dev/null
+++ b/drivers/md/dm-vdo/pool-sysfs.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_POOL_SYSFS_H
+#define VDO_POOL_SYSFS_H
+
+#include <linux/kobject.h>
+
+/* The kobj_type used for setting up the kernel layer kobject. */
+extern struct kobj_type vdo_directory_type;
+
+/* The sysfs_ops used for the "statistics" subdirectory. */
+extern const struct sysfs_ops vdo_pool_stats_sysfs_ops;
+/* The attributes used for the "statistics" subdirectory. */
+extern struct attribute *vdo_pool_stats_attrs[];
+
+#endif /* VDO_POOL_SYSFS_H */
diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c
new file mode 100644
index 00000000000..d0fb949af87
--- /dev/null
+++ b/drivers/md/dm-vdo/priority-table.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "priority-table.h"
+
+#include <linux/log2.h>
+
+#include "errors.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "status-codes.h"
+
+/* We use a single 64-bit search vector, so the maximum priority is 63 */
+enum {
+	MAX_PRIORITY = 63
+};
+
+/*
+ * All the entries with the same priority are queued in a circular list in a bucket for that
+ * priority. The table is essentially an array of buckets.
+ */
+struct bucket {
+	/* The head of a queue of table entries, all having the same priority */
+	struct list_head queue;
+	/* The priority of all the entries in this bucket */
+	unsigned int priority;
+};
+
+/*
+ * A priority table is an array of buckets, indexed by priority. New entries are added to the end
+ * of the queue in the appropriate bucket. The dequeue operation finds the highest-priority
+ * non-empty bucket by searching a bit vector represented as a single 8-byte word, which is very
+ * fast with compiler and CPU support.
+ */
+struct priority_table {
+	/* The maximum priority of entries that may be stored in this table */
+	unsigned int max_priority;
+	/* A bit vector flagging all buckets that are currently non-empty */
+	u64 search_vector;
+	/* The array of all buckets, indexed by priority */
+	struct bucket buckets[];
+};
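+
+/*
+ * Worked example (illustrative): if entries are queued only at priorities 2 and 5, the search
+ * vector is (1ULL << 2) | (1ULL << 5) = 0x24. The next dequeue computes ilog2(0x24) = 5 and
+ * services bucket 5; once bucket 5 empties, bit 5 is cleared and subsequent dequeues fall back
+ * to bucket 2.
+ */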
+
+/**
+ * vdo_make_priority_table() - Allocate and initialize a new priority_table.
+ * @max_priority: The maximum priority value for table entries.
+ * @table_ptr: A pointer to hold the new table.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+int vdo_make_priority_table(unsigned int max_priority, struct priority_table **table_ptr)
+{
+	struct priority_table *table;
+	int result;
+	unsigned int priority;
+
+	if (max_priority > MAX_PRIORITY)
+		return UDS_INVALID_ARGUMENT;
+
+	result = UDS_ALLOCATE_EXTENDED(struct priority_table, max_priority + 1,
+				       struct bucket, __func__, &table);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (priority = 0; priority <= max_priority; priority++) {
+		struct bucket *bucket = &table->buckets[priority];
+
+		bucket->priority = priority;
+		INIT_LIST_HEAD(&bucket->queue);
+	}
+
+	table->max_priority = max_priority;
+	table->search_vector = 0;
+
+	*table_ptr = table;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_priority_table() - Free a priority_table.
+ * @table: The table to free.
+ *
+ * The table does not own the entries stored in it and they are not freed by this call.
+ */
+void vdo_free_priority_table(struct priority_table *table)
+{
+	if (table == NULL)
+		return;
+
+	/*
+	 * Unlink the buckets from any entries still in the table so the entries won't be left with
+	 * dangling pointers to freed memory.
+	 */
+	vdo_reset_priority_table(table);
+
+	UDS_FREE(table);
+}
+
+/**
+ * vdo_reset_priority_table() - Reset a priority table, leaving it in the same empty state as when
+ *                          newly constructed.
+ * @table: The table to reset.
+ *
+ * The table does not own the entries stored in it and they are not freed (or even unlinked from
+ * each other) by this call.
+ */
+void vdo_reset_priority_table(struct priority_table *table)
+{
+	unsigned int priority;
+
+	table->search_vector = 0;
+	for (priority = 0; priority <= table->max_priority; priority++)
+		list_del_init(&table->buckets[priority].queue);
+}
+
+/**
+ * vdo_priority_table_enqueue() - Add a new entry to the priority table, appending it to the queue
+ *                                for entries with the specified priority.
+ * @table: The table in which to store the entry.
+ * @priority: The priority of the entry.
+ * @entry: The list_head embedded in the entry to store in the table (the caller must have
+ *         initialized it).
+ */
+void vdo_priority_table_enqueue(struct priority_table *table,
+				unsigned int priority,
+				struct list_head *entry)
+{
+	ASSERT_LOG_ONLY((priority <= table->max_priority),
+			"entry priority must be valid for the table");
+
+	/* Append the entry to the queue in the specified bucket. */
+	list_move_tail(entry, &table->buckets[priority].queue);
+
+	/* Flag the bucket in the search vector since it must be non-empty. */
+	table->search_vector |= (1ULL << priority);
+}
+
+static inline void mark_bucket_empty(struct priority_table *table, struct bucket *bucket)
+{
+	table->search_vector &= ~(1ULL << bucket->priority);
+}
+
+/**
+ * vdo_priority_table_dequeue() - Find the highest-priority entry in the table, remove it from the
+ *                                table, and return it.
+ * @table: The priority table from which to remove an entry.
+ *
+ * If there are multiple entries with the same priority, the one that has been in the table with
+ * that priority the longest will be returned.
+ *
+ * Return: The dequeued entry, or NULL if the table is currently empty.
+ */
+struct list_head *vdo_priority_table_dequeue(struct priority_table *table)
+{
+	struct bucket *bucket;
+	struct list_head *entry;
+	int top_priority;
+
+	if (table->search_vector == 0)
+		/* All buckets are empty. */
+		return NULL;
+
+	/*
+	 * Find the highest priority non-empty bucket by finding the highest-order non-zero bit in
+	 * the search vector.
+	 */
+	top_priority = ilog2(table->search_vector);
+
+	/* Dequeue the first entry in the bucket. */
+	bucket = &table->buckets[top_priority];
+	entry = bucket->queue.next;
+	list_del_init(entry);
+
+	/* Clear the bit in the search vector if the bucket has been emptied. */
+	if (list_empty(&bucket->queue))
+		mark_bucket_empty(table, bucket);
+
+	return entry;
+}
+
+/**
+ * vdo_priority_table_remove() - Remove a specified entry from its priority table.
+ * @table: The table from which to remove the entry.
+ * @entry: The entry to remove from the table.
+ */
+void vdo_priority_table_remove(struct priority_table *table, struct list_head *entry)
+{
+	struct list_head *next_entry;
+
+	/*
+	 * We can't guard against calls where the entry is on a list for a different table, but
+	 * it's easy to deal with an entry not in any table or list.
+	 */
+	if (list_empty(entry))
+		return;
+
+	/*
+	 * Remove the entry from the bucket list, remembering a pointer to another entry in the
+	 * ring.
+	 */
+	next_entry = entry->next;
+	list_del_init(entry);
+
+	/*
+	 * If the rest of the list is now empty, the next node must be the list head in the bucket
+	 * and we can use it to update the search vector.
+	 */
+	if (list_empty(next_entry))
+		mark_bucket_empty(table, list_entry(next_entry, struct bucket, queue));
+}
+
+/**
+ * vdo_is_priority_table_empty() - Return whether the priority table is empty.
+ * @table: The table to check.
+ *
+ * Return: true if the table is empty.
+ */
+bool vdo_is_priority_table_empty(struct priority_table *table)
+{
+	return (table->search_vector == 0);
+}
diff --git a/drivers/md/dm-vdo/priority-table.h b/drivers/md/dm-vdo/priority-table.h
new file mode 100644
index 00000000000..7c5f689dc2a
--- /dev/null
+++ b/drivers/md/dm-vdo/priority-table.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_PRIORITY_TABLE_H
+#define VDO_PRIORITY_TABLE_H
+
+#include <linux/list.h>
+
+/*
+ * A priority_table is a simple implementation of a priority queue for entries with priorities that
+ * are small non-negative integer values. It implements the obvious priority queue operations of
+ * enqueuing an entry and dequeuing an entry with the maximum priority. It also supports removing
+ * an arbitrary entry. The priority of an entry already in the table can be changed by removing it
+ * and re-enqueuing it with a different priority. All operations have O(1) complexity.
+ *
+ * The links for the table entries must be embedded in the entries themselves. Lists are used to
+ * link entries in the table and no wrapper type is declared, so an existing list entry in an
+ * object can also be used to queue it in a priority_table, assuming the field is not used for
+ * anything else while so queued.
+ *
+ * The table is implemented as an array of queues (circular lists) indexed by priority, along with
+ * a hint for which queues are non-empty. Steven Skiena calls a very similar structure a "bounded
+ * height priority queue", but given the resemblance to a hash table, "priority table" seems both
+ * shorter and more apt, if somewhat novel.
+ */
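+
+/*
+ * Minimal usage sketch (hypothetical caller, for illustration only; error handling omitted):
+ *
+ *	struct work_item {
+ *		struct list_head entry;		// link used while the item is in the table
+ *		// ... payload ...
+ *	};
+ *
+ *	struct priority_table *table;
+ *	struct list_head *head;
+ *	struct work_item *item = ...;
+ *
+ *	vdo_make_priority_table(7, &table);		// priorities 0..7
+ *	INIT_LIST_HEAD(&item->entry);
+ *	vdo_priority_table_enqueue(table, 3, &item->entry);
+ *	head = vdo_priority_table_dequeue(table);	// highest priority, oldest first
+ *	if (head != NULL)
+ *		item = list_entry(head, struct work_item, entry);
+ *	vdo_free_priority_table(table);
+ */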
+
+struct priority_table;
+
+int __must_check
+vdo_make_priority_table(unsigned int max_priority, struct priority_table **table_ptr);
+
+void vdo_free_priority_table(struct priority_table *table);
+
+void vdo_priority_table_enqueue(struct priority_table *table,
+				unsigned int priority,
+				struct list_head *entry);
+
+void vdo_reset_priority_table(struct priority_table *table);
+
+struct list_head * __must_check vdo_priority_table_dequeue(struct priority_table *table);
+
+void vdo_priority_table_remove(struct priority_table *table, struct list_head *entry);
+
+bool __must_check vdo_is_priority_table_empty(struct priority_table *table);
+
+#endif /* VDO_PRIORITY_TABLE_H */
diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c
new file mode 100644
index 00000000000..09db9a9dffa
--- /dev/null
+++ b/drivers/md/dm-vdo/recovery-journal.c
@@ -0,0 +1,1772 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "recovery-journal.h"
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "slab-depot.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+static const u64 RECOVERY_COUNT_MASK = 0xff;
+
+enum {
+	/*
+	 * The number of reserved blocks must be large enough to prevent a new recovery journal
+	 * block write from overwriting a block which appears to still be a valid head block of the
+	 * journal. Currently, that means reserving enough space for all 2048 data_vios.
+	 */
+	RECOVERY_JOURNAL_RESERVED_BLOCKS =
+		(MAXIMUM_VDO_USER_VIOS / RECOVERY_JOURNAL_ENTRIES_PER_BLOCK) + 2,
+	WRITE_FLAGS = REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH | REQ_SYNC | REQ_FUA,
+};
+
+/**
+ * DOC: Lock Counters.
+ *
+ * A lock_counter is intended to keep all of the locks for the blocks in the recovery journal. The
+ * per-zone counters are all kept in a single array which is arranged by zone (i.e. zone 0's lock 0
+ * is at index 0, zone 0's lock 1 is at index 1, and zone 1's lock 0 is at index 'locks'). This
+ * arrangement is intended to minimize cache-line contention for counters from different zones.
+ *
+ * The locks are implemented as a single object instead of as a lock counter per lock both to
+ * afford this opportunity to reduce cache line contention and also to eliminate the need to have a
+ * completion per lock.
+ *
+ * Lock sets are laid out with the set for recovery journal first, followed by the logical zones,
+ * and then the physical zones.
+ */
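+
+/*
+ * Illustrative index arithmetic: within one zone type's array, zone z's counter for lock n lives
+ * at index (locks * z) + n, exactly as computed in get_counter() below. With locks = 4, for
+ * example, zone 0 occupies indices 0-3 and zone 1's lock 2 sits at index (4 * 1) + 2 = 6.
+ */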
+
+enum lock_counter_state {
+	LOCK_COUNTER_STATE_NOT_NOTIFYING,
+	LOCK_COUNTER_STATE_NOTIFYING,
+	LOCK_COUNTER_STATE_SUSPENDED,
+};
+
+/**
+ * get_zone_count_ptr() - Get a pointer to the zone count for a given lock on a given zone.
+ * @journal: The recovery journal.
+ * @lock_number: The lock to get.
+ * @zone_type: The zone type whose count is desired.
+ *
+ * Return: A pointer to the zone count for the given lock and zone.
+ */
+static inline atomic_t *get_zone_count_ptr(struct recovery_journal *journal,
+					   block_count_t lock_number,
+					   enum vdo_zone_type zone_type)
+{
+	return ((zone_type == VDO_ZONE_TYPE_LOGICAL)
+		? &journal->lock_counter.logical_zone_counts[lock_number]
+		: &journal->lock_counter.physical_zone_counts[lock_number]);
+}
+
+/**
+ * get_counter() - Get the zone counter for a given lock on a given zone.
+ * @journal: The recovery journal.
+ * @lock_number: The lock to get.
+ * @zone_type: The zone type whose count is desired.
+ * @zone_id: The zone index whose count is desired.
+ *
+ * Return: The counter for the given lock and zone.
+ */
+static inline u16 *get_counter(struct recovery_journal *journal,
+			       block_count_t lock_number,
+			       enum vdo_zone_type zone_type,
+			       zone_count_t zone_id)
+{
+	struct lock_counter *counter = &journal->lock_counter;
+	block_count_t zone_counter = (counter->locks * zone_id) + lock_number;
+
+	if (zone_type == VDO_ZONE_TYPE_JOURNAL)
+		return &counter->journal_counters[zone_counter];
+
+	if (zone_type == VDO_ZONE_TYPE_LOGICAL)
+		return &counter->logical_counters[zone_counter];
+
+	return &counter->physical_counters[zone_counter];
+}
+
+static atomic_t *get_decrement_counter(struct recovery_journal *journal, block_count_t lock_number)
+{
+	return &journal->lock_counter.journal_decrement_counts[lock_number];
+}
+
+/**
+ * is_journal_zone_locked() - Check whether the journal zone is locked for a given lock.
+ * @journal: The recovery journal.
+ * @lock_number: The lock to check.
+ *
+ * Return: true if the journal zone is locked.
+ */
+static bool is_journal_zone_locked(struct recovery_journal *journal, block_count_t lock_number)
+{
+	u16 journal_value = *(get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0));
+	u32 decrements = atomic_read(get_decrement_counter(journal, lock_number));
+
+	/* Pairs with barrier in vdo_release_journal_entry_lock() */
+	smp_rmb();
+	ASSERT_LOG_ONLY((decrements <= journal_value),
+			"journal zone lock counter must not underflow");
+	return (journal_value != decrements);
+}
+
+/**
+ * vdo_release_recovery_journal_block_reference() - Release a reference to a recovery journal
+ *                                                  block.
+ * @journal: The recovery journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ * @zone_type: The type of the zone making the adjustment.
+ * @zone_id: The ID of the zone making the adjustment.
+ *
+ * If this is the last reference for a given zone type, an attempt will be made to reap the
+ * journal.
+ */
+void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal,
+						  sequence_number_t sequence_number,
+						  enum vdo_zone_type zone_type,
+						  zone_count_t zone_id)
+{
+	u16 *current_value;
+	block_count_t lock_number;
+	int prior_state;
+
+	if (sequence_number == 0)
+		return;
+
+	lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
+	current_value = get_counter(journal, lock_number, zone_type, zone_id);
+
+	ASSERT_LOG_ONLY((*current_value >= 1), "decrement of lock counter must not underflow");
+	*current_value -= 1;
+
+	if (zone_type == VDO_ZONE_TYPE_JOURNAL) {
+		if (is_journal_zone_locked(journal, lock_number))
+			return;
+	} else {
+		atomic_t *zone_count;
+
+		if (*current_value != 0)
+			return;
+
+		zone_count = get_zone_count_ptr(journal, lock_number, zone_type);
+
+		if (atomic_add_return(-1, zone_count) > 0)
+			return;
+	}
+
+	/*
+	 * Extra barriers because this was originally developed using a CAS operation that implicitly
+	 * had them.
+	 */
+	smp_mb__before_atomic();
+	prior_state = atomic_cmpxchg(&journal->lock_counter.state,
+				     LOCK_COUNTER_STATE_NOT_NOTIFYING,
+				     LOCK_COUNTER_STATE_NOTIFYING);
+	/* same as before_atomic */
+	smp_mb__after_atomic();
+
+	if (prior_state != LOCK_COUNTER_STATE_NOT_NOTIFYING)
+		return;
+
+	vdo_launch_completion(&journal->lock_counter.completion);
+}
+
+static inline struct recovery_journal_block * __must_check
+get_journal_block(struct list_head *list)
+{
+	return list_first_entry_or_null(list, struct recovery_journal_block, list_node);
+}
+
+/**
+ * pop_free_list() - Get a block from the end of the free list.
+ * @journal: The journal.
+ *
+ * Return: The block or NULL if the list is empty.
+ */
+static struct recovery_journal_block * __must_check
+pop_free_list(struct recovery_journal *journal)
+{
+	struct recovery_journal_block *block;
+
+	if (list_empty(&journal->free_tail_blocks))
+		return NULL;
+
+	block = list_last_entry(&journal->free_tail_blocks,
+				struct recovery_journal_block,
+				list_node);
+	list_del_init(&block->list_node);
+	return block;
+}
+
+/**
+ * is_block_dirty() - Check whether a recovery block is dirty.
+ * @block: The block to check.
+ *
+ * A block is dirty when it has any uncommitted entries, which includes both entries not yet
+ * written and entries written but not yet acknowledged.
+ *
+ * Return: true if the block has any uncommitted entries.
+ */
+static inline bool __must_check is_block_dirty(const struct recovery_journal_block *block)
+{
+	return (block->uncommitted_entry_count > 0);
+}
+
+/**
+ * is_block_empty() - Check whether a journal block is empty.
+ * @block: The block to check.
+ *
+ * Return: true if the block has no entries.
+ */
+static inline bool __must_check is_block_empty(const struct recovery_journal_block *block)
+{
+	return (block->entry_count == 0);
+}
+
+/**
+ * is_block_full() - Check whether a journal block is full.
+ * @block: The block to check.
+ *
+ * Return: true if the block is full.
+ */
+static inline bool __must_check is_block_full(const struct recovery_journal_block *block)
+{
+	return ((block == NULL) || (block->journal->entries_per_block == block->entry_count));
+}
+
+/**
+ * assert_on_journal_thread() - Assert that we are running on the journal thread.
+ * @journal: The journal.
+ * @function_name: The function doing the check (for logging).
+ */
+static void assert_on_journal_thread(struct recovery_journal *journal, const char *function_name)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->thread_id),
+			"%s() called on journal thread", function_name);
+}
+
+/**
+ * continue_waiter() - Release a data_vio from the journal.
+ * @waiter: The waiter embedded in the data_vio to release.
+ * @context: A pointer to the result code with which to continue the data_vio.
+ *
+ * Invoked whenever a data_vio is to be released from the journal, either because its entry was
+ * committed to disk, or because there was an error. Implements waiter_callback.
+ */
+static void continue_waiter(struct waiter *waiter, void *context)
+{
+	continue_data_vio_with_error(waiter_as_data_vio(waiter), *((int *) context));
+}
+
+/**
+ * has_block_waiters() - Check whether the journal has any waiters on any blocks.
+ * @journal: The journal in question.
+ *
+ * Return: true if any block has a waiter.
+ */
+static inline bool has_block_waiters(struct recovery_journal *journal)
+{
+	struct recovery_journal_block *block = get_journal_block(&journal->active_tail_blocks);
+
+	/*
+	 * Either the first active tail block (if it exists) has waiters, or no active tail block
+	 * has waiters.
+	 */
+	return ((block != NULL) &&
+		(vdo_has_waiters(&block->entry_waiters) ||
+		 vdo_has_waiters(&block->commit_waiters)));
+}
+
+static void recycle_journal_blocks(struct recovery_journal *journal);
+static void recycle_journal_block(struct recovery_journal_block *block);
+static void notify_commit_waiters(struct recovery_journal *journal);
+
+/**
+ * suspend_lock_counter() - Prevent the lock counter from notifying.
+ * @counter: The counter.
+ *
+ * Return: true if the lock counter was not notifying and hence the suspend was efficacious.
+ */
+static bool suspend_lock_counter(struct lock_counter *counter)
+{
+	int prior_state;
+
+	/*
+	 * Extra barriers because this was originally developed using a CAS operation that
+	 * implicitly had them.
+	 */
+	smp_mb__before_atomic();
+	prior_state = atomic_cmpxchg(&counter->state,
+				     LOCK_COUNTER_STATE_NOT_NOTIFYING,
+				     LOCK_COUNTER_STATE_SUSPENDED);
+	/* same as before_atomic */
+	smp_mb__after_atomic();
+
+	return ((prior_state == LOCK_COUNTER_STATE_SUSPENDED) ||
+		(prior_state == LOCK_COUNTER_STATE_NOT_NOTIFYING));
+}
+
+static inline bool is_read_only(struct recovery_journal *journal)
+{
+	return vdo_is_read_only(journal->flush_vio->completion.vdo);
+}
+
+/**
+ * check_for_drain_complete() - Check whether the journal has drained.
+ * @journal: The journal which may have just drained.
+ */
+static void check_for_drain_complete(struct recovery_journal *journal)
+{
+	int result = VDO_SUCCESS;
+
+	if (is_read_only(journal)) {
+		result = VDO_READ_ONLY;
+		/*
+		 * Clean up any full active blocks which were not written due to read-only mode.
+		 *
+		 * FIXME: This would probably be better as a short-circuit in write_block().
+		 */
+		notify_commit_waiters(journal);
+		recycle_journal_blocks(journal);
+
+		/* Release any data_vios waiting to be assigned entries. */
+		vdo_notify_all_waiters(&journal->entry_waiters, continue_waiter, &result);
+	}
+
+	if (!vdo_is_state_draining(&journal->state) ||
+	    journal->reaping ||
+	    has_block_waiters(journal) ||
+	    vdo_has_waiters(&journal->entry_waiters) ||
+	    !suspend_lock_counter(&journal->lock_counter))
+		return;
+
+	if (vdo_is_state_saving(&journal->state)) {
+		if (journal->active_block != NULL) {
+			ASSERT_LOG_ONLY(((result == VDO_READ_ONLY) ||
+					 !is_block_dirty(journal->active_block)),
+					"journal being saved has clean active block");
+			recycle_journal_block(journal->active_block);
+		}
+
+		ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
+				"all blocks in a journal being saved must be inactive");
+	}
+
+	vdo_finish_draining_with_result(&journal->state, result);
+}
+
+/**
+ * notify_recovery_journal_of_read_only_mode() - Notify a recovery journal that the VDO has gone
+ *                                               read-only.
+ * @listener: The journal.
+ * @parent: The completion to notify in order to acknowledge the notification.
+ *
+ * Implements vdo_read_only_notification.
+ */
+static void
+notify_recovery_journal_of_read_only_mode(void *listener, struct vdo_completion *parent)
+{
+	check_for_drain_complete(listener);
+	vdo_finish_completion(parent);
+}
+
+/**
+ * enter_journal_read_only_mode() - Put the journal in read-only mode.
+ * @journal: The journal which has failed.
+ * @error_code: The error result triggering this call.
+ *
+ * All attempts to add entries after this function is called will fail. All VIOs waiting for
+ * commits will be awakened with an error.
+ */
+static void enter_journal_read_only_mode(struct recovery_journal *journal, int error_code)
+{
+	vdo_enter_read_only_mode(journal->flush_vio->completion.vdo, error_code);
+	check_for_drain_complete(journal);
+}
+
+/**
+ * vdo_get_recovery_journal_current_sequence_number() - Obtain the recovery journal's current
+ *                                                      sequence number.
+ * @journal: The journal in question.
+ *
+ * Exposed only so the block map can be initialized therefrom.
+ *
+ * Return: The sequence number of the tail block.
+ */
+sequence_number_t
+vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal)
+{
+	return journal->tail;
+}
+
+/**
+ * get_recovery_journal_head() - Get the head of the recovery journal.
+ * @journal: The journal.
+ *
+ * The head is the lower of the block map head and the slab journal head sequence numbers.
+ *
+ * Return: the head of the journal.
+ */
+static inline sequence_number_t get_recovery_journal_head(const struct recovery_journal *journal)
+{
+	return min(journal->block_map_head, journal->slab_journal_head);
+}
+
+/**
+ * compute_recovery_count_byte() - Compute the recovery count byte for a given recovery count.
+ * @recovery_count: The recovery count.
+ *
+ * Return: The byte corresponding to the recovery count.
+ */
+static inline u8 __must_check compute_recovery_count_byte(u64 recovery_count)
+{
+	return (u8)(recovery_count & RECOVERY_COUNT_MASK);
+}
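+
+/* For example, compute_recovery_count_byte(0x123) yields 0x23, keeping only the low byte. */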
+
+/**
+ * check_slab_journal_commit_threshold() - Check whether the journal is over the threshold, and if
+ *                                         so, force the oldest slab journal tail block to commit.
+ * @journal: The journal.
+ */
+static void check_slab_journal_commit_threshold(struct recovery_journal *journal)
+{
+	block_count_t current_length = journal->tail - journal->slab_journal_head;
+
+	if (current_length > journal->slab_journal_commit_threshold) {
+		journal->events.slab_journal_commits_requested++;
+		vdo_commit_oldest_slab_journal_tail_blocks(journal->depot,
+							   journal->slab_journal_head);
+	}
+}
+
+static void reap_recovery_journal(struct recovery_journal *journal);
+static void assign_entries(struct recovery_journal *journal);
+
+/**
+ * finish_reaping() - Finish reaping the journal.
+ * @journal: The journal being reaped.
+ */
+static void finish_reaping(struct recovery_journal *journal)
+{
+	block_count_t blocks_reaped;
+	sequence_number_t old_head = get_recovery_journal_head(journal);
+
+	journal->block_map_head = journal->block_map_reap_head;
+	journal->slab_journal_head = journal->slab_journal_reap_head;
+	blocks_reaped = get_recovery_journal_head(journal) - old_head;
+	journal->available_space += blocks_reaped * journal->entries_per_block;
+	journal->reaping = false;
+	check_slab_journal_commit_threshold(journal);
+	assign_entries(journal);
+	check_for_drain_complete(journal);
+}
+
+/**
+ * complete_reaping() - Finish reaping the journal after flushing the lower layer.
+ * @completion: The journal's flush VIO.
+ *
+ * This is the callback registered in reap_recovery_journal().
+ */
+static void complete_reaping(struct vdo_completion *completion)
+{
+	struct recovery_journal *journal = completion->parent;
+
+	finish_reaping(journal);
+
+	/* Try reaping again in case more locks were released while flush was out. */
+	reap_recovery_journal(journal);
+}
+
+/**
+ * handle_flush_error() - Handle an error when flushing the lower layer due to reaping.
+ * @completion: The journal's flush VIO.
+ */
+static void handle_flush_error(struct vdo_completion *completion)
+{
+	struct recovery_journal *journal = completion->parent;
+
+	vio_record_metadata_io_error(as_vio(completion));
+	journal->reaping = false;
+	enter_journal_read_only_mode(journal, completion->result);
+}
+
+static void flush_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct recovery_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio, complete_reaping, journal->thread_id);
+}
+
+/**
+ * initialize_journal_state() - Set all journal fields appropriately to start journaling from the
+ *                              current active block.
+ * @journal: The journal to be reset based on its active block.
+ */
+static void initialize_journal_state(struct recovery_journal *journal)
+{
+	journal->append_point.sequence_number = journal->tail;
+	journal->last_write_acknowledged = journal->tail;
+	journal->block_map_head = journal->tail;
+	journal->slab_journal_head = journal->tail;
+	journal->block_map_reap_head = journal->tail;
+	journal->slab_journal_reap_head = journal->tail;
+	journal->block_map_head_block_number =
+		vdo_get_recovery_journal_block_number(journal, journal->block_map_head);
+	journal->slab_journal_head_block_number =
+		vdo_get_recovery_journal_block_number(journal, journal->slab_journal_head);
+	journal->available_space =
+		(journal->entries_per_block * vdo_get_recovery_journal_length(journal->size));
+}
+
+/**
+ * vdo_get_recovery_journal_length() - Get the number of usable recovery journal blocks.
+ * @journal_size: The size of the recovery journal in blocks.
+ *
+ * Return: the number of recovery journal blocks usable for entries.
+ */
+block_count_t vdo_get_recovery_journal_length(block_count_t journal_size)
+{
+	block_count_t reserved_blocks = journal_size / 4;
+
+	if (reserved_blocks > RECOVERY_JOURNAL_RESERVED_BLOCKS)
+		reserved_blocks = RECOVERY_JOURNAL_RESERVED_BLOCKS;
+	return (journal_size - reserved_blocks);
+}
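+
+/*
+ * Illustrative arithmetic: the computation above reserves min(journal_size / 4,
+ * RECOVERY_JOURNAL_RESERVED_BLOCKS) blocks. A hypothetical 32-block journal would reserve 8
+ * blocks (assuming RECOVERY_JOURNAL_RESERVED_BLOCKS is at least 8), leaving 24 blocks usable
+ * for entries.
+ */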
+
+/**
+ * reap_recovery_journal_callback() - Attempt to reap the journal.
+ * @completion: The lock counter completion.
+ *
+ * Attempts to reap the journal now that all the locks on some journal block have been released.
+ * This is the callback registered with the lock counter.
+ */
+static void reap_recovery_journal_callback(struct vdo_completion *completion)
+{
+	struct recovery_journal *journal = (struct recovery_journal *) completion->parent;
+	/*
+	 * The acknowledgement must be done before reaping so that there is no race between
+	 * acknowledging the notification and unlocks wishing to notify.
+	 */
+	smp_wmb();
+	atomic_set(&journal->lock_counter.state, LOCK_COUNTER_STATE_NOT_NOTIFYING);
+
+	if (vdo_is_state_quiescing(&journal->state)) {
+		/*
+		 * Don't start reaping when the journal is trying to quiesce. Do check if this
+		 * notification is the last thing the journal is waiting on.
+		 */
+		check_for_drain_complete(journal);
+		return;
+	}
+
+	reap_recovery_journal(journal);
+	check_slab_journal_commit_threshold(journal);
+}
+
+/**
+ * initialize_lock_counter() - Initialize a lock counter.
+ *
+ * @journal: The recovery journal.
+ * @vdo: The vdo.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check initialize_lock_counter(struct recovery_journal *journal, struct vdo *vdo)
+{
+	int result;
+	struct thread_config *config = &vdo->thread_config;
+	struct lock_counter *counter = &journal->lock_counter;
+
+	result = UDS_ALLOCATE(journal->size, u16, __func__, &counter->journal_counters);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(journal->size,
+			      atomic_t,
+			      __func__,
+			      &counter->journal_decrement_counts);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(journal->size * config->logical_zone_count,
+			      u16,
+			      __func__,
+			      &counter->logical_counters);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(journal->size, atomic_t, __func__, &counter->logical_zone_counts);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(journal->size * config->physical_zone_count,
+			      u16,
+			      __func__,
+			      &counter->physical_counters);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(journal->size, atomic_t, __func__, &counter->physical_zone_counts);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	vdo_initialize_completion(&counter->completion, vdo, VDO_LOCK_COUNTER_COMPLETION);
+	vdo_prepare_completion(&counter->completion,
+			       reap_recovery_journal_callback,
+			       reap_recovery_journal_callback,
+			       config->journal_thread,
+			       journal);
+	counter->logical_zones = config->logical_zone_count;
+	counter->physical_zones = config->physical_zone_count;
+	counter->locks = journal->size;
+	return VDO_SUCCESS;
+}
+
+/**
+ * set_journal_tail() - Set the journal's tail sequence number.
+ * @journal: The journal whose tail is to be set.
+ * @tail: The new tail value.
+ */
+static void set_journal_tail(struct recovery_journal *journal, sequence_number_t tail)
+{
+	/* VDO does not support sequence numbers above 1 << 48 in the slab journal. */
+	if (tail >= (1ULL << 48))
+		enter_journal_read_only_mode(journal, VDO_JOURNAL_OVERFLOW);
+
+	journal->tail = tail;
+}
+
+/**
+ * initialize_recovery_block() - Initialize a journal block.
+ * @vdo: The vdo from which to construct vios.
+ * @journal: The journal to which the block will belong.
+ * @block: The block to initialize.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int initialize_recovery_block(struct vdo *vdo,
+				     struct recovery_journal *journal,
+				     struct recovery_journal_block *block)
+{
+	char *data;
+	int result;
+
+	/*
+	 * Ensure that a block is large enough to store RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries.
+	 */
+	STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK
+		      <= ((VDO_BLOCK_SIZE - sizeof(struct packed_journal_header)) /
+			  sizeof(struct packed_recovery_journal_entry)));
+
+	/*
+	 * Allocate a full block for the journal block even though not all of the space is used
+	 * since the VIO needs to write a full disk block.
+	 */
+	result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, __func__, &data);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = allocate_vio_components(vdo,
+					 VIO_TYPE_RECOVERY_JOURNAL,
+					 VIO_PRIORITY_HIGH,
+					 block,
+					 1,
+					 data,
+					 &block->vio);
+	if (result != VDO_SUCCESS) {
+		UDS_FREE(data);
+		return result;
+	}
+
+	list_add_tail(&block->list_node, &journal->free_tail_blocks);
+	block->journal = journal;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_decode_recovery_journal() - Make a recovery journal and initialize it with the state that
+ *                                 was decoded from the super block.
+ *
+ * @state: The decoded state of the journal.
+ * @nonce: The nonce of the VDO.
+ * @vdo: The VDO.
+ * @partition: The partition for the journal.
+ * @recovery_count: The VDO's number of completed recoveries.
+ * @journal_size: The number of blocks in the journal on disk.
+ * @journal_ptr: The pointer to hold the new recovery journal.
+ *
+ * Return: A success or error code.
+ */
+int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state,
+				nonce_t nonce,
+				struct vdo *vdo,
+				struct partition *partition,
+				u64 recovery_count,
+				block_count_t journal_size,
+				struct recovery_journal **journal_ptr)
+{
+	block_count_t i;
+	struct recovery_journal *journal;
+	int result;
+
+	result = UDS_ALLOCATE_EXTENDED(struct recovery_journal,
+				       RECOVERY_JOURNAL_RESERVED_BLOCKS,
+				       struct recovery_journal_block,
+				       __func__,
+				       &journal);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	INIT_LIST_HEAD(&journal->free_tail_blocks);
+	INIT_LIST_HEAD(&journal->active_tail_blocks);
+	vdo_initialize_wait_queue(&journal->pending_writes);
+
+	journal->thread_id = vdo->thread_config.journal_thread;
+	journal->origin = partition->offset;
+	journal->nonce = nonce;
+	journal->recovery_count = compute_recovery_count_byte(recovery_count);
+	journal->size = journal_size;
+	journal->slab_journal_commit_threshold = (journal_size * 2) / 3;
+	journal->logical_blocks_used = state.logical_blocks_used;
+	journal->block_map_data_blocks = state.block_map_data_blocks;
+	journal->entries_per_block = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK;
+	set_journal_tail(journal, state.journal_start);
+	initialize_journal_state(journal);
+	/* TODO: this will have to change if we make initial resume of a VDO a real resume */
+	vdo_set_admin_state_code(&journal->state, VDO_ADMIN_STATE_SUSPENDED);
+
+	for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) {
+		struct recovery_journal_block *block = &journal->blocks[i];
+
+		result = initialize_recovery_block(vdo, journal, block);
+		if (result != VDO_SUCCESS) {
+			vdo_free_recovery_journal(journal);
+			return result;
+		}
+	}
+
+	result = initialize_lock_counter(journal, vdo);
+	if (result != VDO_SUCCESS) {
+		vdo_free_recovery_journal(journal);
+		return result;
+	}
+
+	result = create_metadata_vio(vdo,
+				     VIO_TYPE_RECOVERY_JOURNAL,
+				     VIO_PRIORITY_HIGH,
+				     journal,
+				     NULL,
+				     &journal->flush_vio);
+	if (result != VDO_SUCCESS) {
+		vdo_free_recovery_journal(journal);
+		return result;
+	}
+
+	result = vdo_register_read_only_listener(vdo,
+						 journal,
+						 notify_recovery_journal_of_read_only_mode,
+						 journal->thread_id);
+	if (result != VDO_SUCCESS) {
+		vdo_free_recovery_journal(journal);
+		return result;
+	}
+
+	result = vdo_make_default_thread(vdo, journal->thread_id);
+	if (result != VDO_SUCCESS) {
+		vdo_free_recovery_journal(journal);
+		return result;
+	}
+
+	journal->flush_vio->completion.callback_thread_id = journal->thread_id;
+	*journal_ptr = journal;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_free_recovery_journal() - Free a recovery journal.
+ * @journal: The recovery journal to free.
+ */
+void vdo_free_recovery_journal(struct recovery_journal *journal)
+{
+	block_count_t i;
+
+	if (journal == NULL)
+		return;
+
+	UDS_FREE(UDS_FORGET(journal->lock_counter.logical_zone_counts));
+	UDS_FREE(UDS_FORGET(journal->lock_counter.physical_zone_counts));
+	UDS_FREE(UDS_FORGET(journal->lock_counter.journal_counters));
+	UDS_FREE(UDS_FORGET(journal->lock_counter.journal_decrement_counts));
+	UDS_FREE(UDS_FORGET(journal->lock_counter.logical_counters));
+	UDS_FREE(UDS_FORGET(journal->lock_counter.physical_counters));
+	free_vio(UDS_FORGET(journal->flush_vio));
+
+	/*
+	 * FIXME: eventually, the journal should be constructed in a quiescent state which
+	 *        requires opening before use.
+	 */
+	if (!vdo_is_state_quiescent(&journal->state))
+		ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
+				"journal being freed has no active tail blocks");
+	else if (!vdo_is_state_saved(&journal->state) && !list_empty(&journal->active_tail_blocks))
+		uds_log_warning("journal being freed has uncommitted entries");
+
+	for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) {
+		struct recovery_journal_block *block = &journal->blocks[i];
+
+		UDS_FREE(UDS_FORGET(block->vio.data));
+		free_vio_components(&block->vio);
+	}
+
+	UDS_FREE(journal);
+}
+
+/**
+ * vdo_initialize_recovery_journal_post_repair() - Initialize the journal after a repair.
+ * @journal: The journal in question.
+ * @recovery_count: The number of completed recoveries.
+ * @tail: The new tail block sequence number.
+ * @logical_blocks_used: The new number of logical blocks used.
+ * @block_map_data_blocks: The new number of block map data blocks.
+ */
+void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal,
+						  u64 recovery_count,
+						  sequence_number_t tail,
+						  block_count_t logical_blocks_used,
+						  block_count_t block_map_data_blocks)
+{
+	set_journal_tail(journal, tail + 1);
+	journal->recovery_count = compute_recovery_count_byte(recovery_count);
+	initialize_journal_state(journal);
+	journal->logical_blocks_used = logical_blocks_used;
+	journal->block_map_data_blocks = block_map_data_blocks;
+}
+
+/**
+ * vdo_get_journal_block_map_data_blocks_used() - Get the number of block map pages, allocated from
+ *                                                data blocks, currently in use.
+ * @journal: The journal in question.
+ *
+ * Return: The number of block map pages allocated from slabs.
+ */
+block_count_t vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal)
+{
+	return journal->block_map_data_blocks;
+}
+
+/**
+ * vdo_get_recovery_journal_thread_id() - Get the ID of a recovery journal's thread.
+ * @journal: The journal to query.
+ *
+ * Return: The ID of the journal's thread.
+ */
+thread_id_t vdo_get_recovery_journal_thread_id(struct recovery_journal *journal)
+{
+	return journal->thread_id;
+}
+
+/**
+ * vdo_open_recovery_journal() - Prepare the journal for new entries.
+ * @journal: The journal in question.
+ * @depot: The slab depot for this VDO.
+ * @block_map: The block map for this VDO.
+ */
+void vdo_open_recovery_journal(struct recovery_journal *journal,
+			       struct slab_depot *depot,
+			       struct block_map *block_map)
+{
+	journal->depot = depot;
+	journal->block_map = block_map;
+	WRITE_ONCE(journal->state.current_state,
+		   VDO_ADMIN_STATE_NORMAL_OPERATION);
+}
+
+/**
+ * vdo_record_recovery_journal() - Record the state of a recovery journal for encoding in the super
+ *                                 block.
+ * @journal: the recovery journal.
+ *
+ * Return: the state of the journal.
+ */
+struct recovery_journal_state_7_0
+vdo_record_recovery_journal(const struct recovery_journal *journal)
+{
+	struct recovery_journal_state_7_0 state = {
+		.logical_blocks_used = journal->logical_blocks_used,
+		.block_map_data_blocks = journal->block_map_data_blocks,
+	};
+
+	if (vdo_is_state_saved(&journal->state))
+		/*
+		 * If the journal is saved, we should start one past the active block (since the
+		 * active block is not guaranteed to be empty).
+		 */
+		state.journal_start = journal->tail;
+	else
+		/*
+		 * When we're merely suspended or have gone read-only, we must record the first
+		 * block that might have entries that need to be applied.
+		 */
+		state.journal_start = get_recovery_journal_head(journal);
+
+	return state;
+}
+
+/**
+ * get_block_header() - Get a pointer to the packed journal block header in the block buffer.
+ * @block: The recovery block.
+ *
+ * Return: The block's header.
+ */
+static inline struct packed_journal_header *
+get_block_header(const struct recovery_journal_block *block)
+{
+	return (struct packed_journal_header *) block->vio.data;
+}
+
+/**
+ * set_active_sector() - Set the current sector of the current block and initialize it.
+ * @block: The block to update.
+ * @sector: A pointer to the first byte of the new sector.
+ */
+static void set_active_sector(struct recovery_journal_block *block, void *sector)
+{
+	block->sector = (struct packed_journal_sector *) sector;
+	block->sector->check_byte = get_block_header(block)->check_byte;
+	block->sector->recovery_count = block->journal->recovery_count;
+	block->sector->entry_count = 0;
+}
+
+/**
+ * advance_tail() - Advance the tail of the journal.
+ * @journal: The journal whose tail should be advanced.
+ *
+ * Return: true if the tail was advanced.
+ */
+static bool advance_tail(struct recovery_journal *journal)
+{
+	struct recovery_block_header unpacked;
+	struct packed_journal_header *header;
+	struct recovery_journal_block *block;
+
+	block = journal->active_block = pop_free_list(journal);
+	if (block == NULL)
+		return false;
+
+	list_move_tail(&block->list_node, &journal->active_tail_blocks);
+
+	unpacked = (struct recovery_block_header) {
+		.metadata_type = VDO_METADATA_RECOVERY_JOURNAL_2,
+		.block_map_data_blocks = journal->block_map_data_blocks,
+		.logical_blocks_used = journal->logical_blocks_used,
+		.nonce = journal->nonce,
+		.recovery_count = journal->recovery_count,
+		.sequence_number = journal->tail,
+		.check_byte = vdo_compute_recovery_journal_check_byte(journal, journal->tail),
+	};
+
+	header = get_block_header(block);
+	memset(block->vio.data, 0x0, VDO_BLOCK_SIZE);
+	block->sequence_number = journal->tail;
+	block->entry_count = 0;
+	block->uncommitted_entry_count = 0;
+	block->block_number = vdo_get_recovery_journal_block_number(journal, journal->tail);
+
+	vdo_pack_recovery_block_header(&unpacked, header);
+	set_active_sector(block, vdo_get_journal_block_sector(header, 1));
+	set_journal_tail(journal, journal->tail + 1);
+	vdo_advance_block_map_era(journal->block_map, journal->tail);
+	return true;
+}
+
+/**
+ * initialize_lock_count() - Initialize the value of the journal zone's counter for a given lock.
+ * @journal: The recovery journal.
+ *
+ * Context: This must be called from the journal zone.
+ */
+static void initialize_lock_count(struct recovery_journal *journal)
+{
+	u16 *journal_value;
+	block_count_t lock_number = journal->active_block->block_number;
+	atomic_t *decrement_counter = get_decrement_counter(journal, lock_number);
+
+	journal_value = get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0);
+	ASSERT_LOG_ONLY((*journal_value == atomic_read(decrement_counter)),
+			"count to be initialized not in use");
+	*journal_value = journal->entries_per_block + 1;
+	atomic_set(decrement_counter, 0);
+}
+
+/**
+ * prepare_to_assign_entry() - Prepare the currently active block to receive an entry and check
+ *			       whether an entry may be assigned at this time.
+ * @journal: The journal receiving an entry.
+ *
+ * Return: true if there is space in the journal to store an entry.
+ */
+static bool prepare_to_assign_entry(struct recovery_journal *journal)
+{
+	if (journal->available_space == 0)
+		return false;
+
+	if (is_block_full(journal->active_block) && !advance_tail(journal))
+		return false;
+
+	if (!is_block_empty(journal->active_block))
+		return true;
+
+	if ((journal->tail - get_recovery_journal_head(journal)) > journal->size) {
+		/* Cannot use this block since the journal is full. */
+		journal->events.disk_full++;
+		return false;
+	}
+
+	/*
+	 * Don't allow the new block to be reaped until all of its entries have been committed to
+	 * the block map and until the journal block has been fully committed as well. Because the
+	 * block map update is done only after any slab journal entries have been made, the
+	 * per-entry lock for the block map entry serves to protect those as well.
+	 */
+	initialize_lock_count(journal);
+	return true;
+}
+
+static void write_blocks(struct recovery_journal *journal);
+
+/**
+ * schedule_block_write() - Queue a block for writing.
+ * @journal: The journal in question.
+ * @block: The block which is now ready to write.
+ *
+ * The block is expected to be full. If the block is currently writing, this is a noop as the block
+ * will be queued for writing when the write finishes. The block must not currently be queued for
+ * writing.
+ */
+static void
+schedule_block_write(struct recovery_journal *journal, struct recovery_journal_block *block)
+{
+	if (!block->committing)
+		vdo_enqueue_waiter(&journal->pending_writes, &block->write_waiter);
+	/*
+	 * At the end of adding entries, or upon discovering that this partial block is now full
+	 * and ready to rewrite, we will call write_blocks() and write a whole batch.
+	 */
+}
+
+/**
+ * release_journal_block_reference() - Release a reference to a journal block.
+ * @block: The journal block from which to release a reference.
+ */
+static void release_journal_block_reference(struct recovery_journal_block *block)
+{
+	vdo_release_recovery_journal_block_reference(block->journal,
+						     block->sequence_number,
+						     VDO_ZONE_TYPE_JOURNAL,
+						     0);
+}
+
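+/**
+ * update_usages() - Update the logical block and block map data block usage counts to account
+ *                   for the entry about to be made for a data_vio.
+ * @journal: The journal tracking the usage counts.
+ * @data_vio: The data_vio for which a journal entry is being added.
+ */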
+static void update_usages(struct recovery_journal *journal, struct data_vio *data_vio)
+{
+	if (data_vio->increment_updater.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		journal->block_map_data_blocks++;
+		return;
+	}
+
+	if (data_vio->new_mapped.state != VDO_MAPPING_STATE_UNMAPPED)
+		journal->logical_blocks_used++;
+
+	if (data_vio->mapped.state != VDO_MAPPING_STATE_UNMAPPED)
+		journal->logical_blocks_used--;
+}
+
+/**
+ * assign_entry() - Assign an entry waiter to the active block.
+ *
+ * Implements waiter_callback.
+ */
+static void assign_entry(struct waiter *waiter, void *context)
+{
+	struct data_vio *data_vio = waiter_as_data_vio(waiter);
+	struct recovery_journal_block *block = (struct recovery_journal_block *) context;
+	struct recovery_journal *journal = block->journal;
+
+	/* Record the point at which we will make the journal entry. */
+	data_vio->recovery_journal_point = (struct journal_point) {
+		.sequence_number = block->sequence_number,
+		.entry_count = block->entry_count,
+	};
+
+	update_usages(journal, data_vio);
+	journal->available_space--;
+
+	if (!vdo_has_waiters(&block->entry_waiters))
+		journal->events.blocks.started++;
+
+	vdo_enqueue_waiter(&block->entry_waiters, &data_vio->waiter);
+	block->entry_count++;
+	block->uncommitted_entry_count++;
+	journal->events.entries.started++;
+
+	if (is_block_full(block))
+		/*
+		 * The block is full, so we can write it anytime henceforth. If it is already
+		 * committing, we'll queue it for writing when it comes back.
+		 */
+		schedule_block_write(journal, block);
+
+	/* Force out slab journal tail blocks when threshold is reached. */
+	check_slab_journal_commit_threshold(journal);
+}
+
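+/**
+ * assign_entries() - Assign waiting entries to the active block until either the waiters or the
+ *                    journal space is exhausted, then issue any block writes which are now ready.
+ * @journal: The journal whose entry waiters should be assigned.
+ */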
+static void assign_entries(struct recovery_journal *journal)
+{
+	if (journal->adding_entries)
+		/* Protect against re-entrancy. */
+		return;
+
+	journal->adding_entries = true;
+	while (vdo_has_waiters(&journal->entry_waiters) && prepare_to_assign_entry(journal))
+		vdo_notify_next_waiter(&journal->entry_waiters,
+				       assign_entry,
+				       journal->active_block);
+
+	/* Now that we've finished with entries, see if we have a batch of blocks to write. */
+	write_blocks(journal);
+	journal->adding_entries = false;
+}
+
+/**
+ * recycle_journal_block() - Prepare an in-memory journal block to be reused now that it has been
+ *                           fully committed.
+ * @block: The block to be recycled.
+ */
+static void recycle_journal_block(struct recovery_journal_block *block)
+{
+	struct recovery_journal *journal = block->journal;
+	block_count_t i;
+
+	list_move_tail(&block->list_node, &journal->free_tail_blocks);
+
+	/* Release any unused entry locks. */
+	for (i = block->entry_count; i < journal->entries_per_block; i++)
+		release_journal_block_reference(block);
+
+	/*
+	 * Release our own lock against reaping now that the block is completely committed, or
+	 * we're giving up because we're in read-only mode.
+	 */
+	if (block->entry_count > 0)
+		release_journal_block_reference(block);
+
+	if (block == journal->active_block)
+		journal->active_block = NULL;
+}
+
+/**
+ * continue_committed_waiter() - Invoked whenever a VIO is to be released from the journal because
+ *                               its entry was committed to disk.
+ *
+ * Implements waiter_callback.
+ */
+static void continue_committed_waiter(struct waiter *waiter, void *context)
+{
+	struct data_vio *data_vio = waiter_as_data_vio(waiter);
+	struct recovery_journal *journal = (struct recovery_journal *)context;
+	int result = (is_read_only(journal) ? VDO_READ_ONLY : VDO_SUCCESS);
+	bool has_decrement;
+
+	ASSERT_LOG_ONLY(vdo_before_journal_point(&journal->commit_point,
+						 &data_vio->recovery_journal_point),
+			"DataVIOs released from recovery journal in order. Recovery journal point is (%llu, %u), but commit waiter point is (%llu, %u)",
+			(unsigned long long) journal->commit_point.sequence_number,
+			journal->commit_point.entry_count,
+			(unsigned long long) data_vio->recovery_journal_point.sequence_number,
+			data_vio->recovery_journal_point.entry_count);
+
+	journal->commit_point = data_vio->recovery_journal_point;
+	data_vio->last_async_operation = VIO_ASYNC_OP_UPDATE_REFERENCE_COUNTS;
+	if (result != VDO_SUCCESS) {
+		continue_data_vio_with_error(data_vio, result);
+		return;
+	}
+
+	/*
+	 * The increment must be launched first since it must come before the
+	 * decrement if they are in the same slab.
+	 */
+	has_decrement = (data_vio->decrement_updater.zpbn.pbn != VDO_ZERO_BLOCK);
+	if ((data_vio->increment_updater.zpbn.pbn != VDO_ZERO_BLOCK) || !has_decrement)
+		continue_data_vio(data_vio);
+
+	if (has_decrement)
+		vdo_launch_completion(&data_vio->decrement_completion);
+}
+
+/**
+ * notify_commit_waiters() - Notify any VIOs whose entries have now committed.
+ * @journal: The recovery journal to update.
+ */
+static void notify_commit_waiters(struct recovery_journal *journal)
+{
+	struct recovery_journal_block *block;
+
+	list_for_each_entry(block, &journal->active_tail_blocks, list_node) {
+		if (block->committing)
+			return;
+
+		vdo_notify_all_waiters(&block->commit_waiters, continue_committed_waiter, journal);
+		if (is_read_only(journal))
+			vdo_notify_all_waiters(&block->entry_waiters,
+					       continue_committed_waiter,
+					       journal);
+		else if (is_block_dirty(block) || !is_block_full(block))
+			/* Stop at partially-committed or partially-filled blocks. */
+			return;
+	}
+}
+
+/**
+ * recycle_journal_blocks() - Recycle any journal blocks which have been fully committed.
+ * @journal: The recovery journal to update.
+ */
+static void recycle_journal_blocks(struct recovery_journal *journal)
+{
+	struct recovery_journal_block *block, *tmp;
+
+	list_for_each_entry_safe(block, tmp, &journal->active_tail_blocks, list_node) {
+		if (block->committing)
+			/* Don't recycle committing blocks. */
+			return;
+
+		if (!is_read_only(journal) && (is_block_dirty(block) || !is_block_full(block)))
+			/*
+			 * Don't recycle partially written or partially full blocks, except in
+			 * read-only mode.
+			 */
+			return;
+
+		recycle_journal_block(block);
+	}
+}
+
+/**
+ * complete_write() - Handle post-commit processing.
+ * @completion: The completion of the VIO writing this block.
+ *
+ * This is the callback registered by write_block(). If more entries accumulated in the block being
+ * committed while the commit was in progress, another commit will be initiated.
+ */
+static void complete_write(struct vdo_completion *completion)
+{
+	struct recovery_journal_block *block = completion->parent;
+	struct recovery_journal *journal = block->journal;
+	struct recovery_journal_block *last_active_block;
+
+	assert_on_journal_thread(journal, __func__);
+
+	journal->pending_write_count -= 1;
+	journal->events.blocks.committed += 1;
+	journal->events.entries.committed += block->entries_in_commit;
+	block->uncommitted_entry_count -= block->entries_in_commit;
+	block->entries_in_commit = 0;
+	block->committing = false;
+
+	/* If this block is the latest block to be acknowledged, record that fact. */
+	if (block->sequence_number > journal->last_write_acknowledged)
+		journal->last_write_acknowledged = block->sequence_number;
+
+	last_active_block = get_journal_block(&journal->active_tail_blocks);
+	ASSERT_LOG_ONLY((block->sequence_number >= last_active_block->sequence_number),
+			"completed journal write is still active");
+
+	notify_commit_waiters(journal);
+
+	/*
+	 * Is this block now full? Reaping, and adding entries, might have already sent it off for
+	 * rewriting; else, queue it for rewrite.
+	 */
+	if (is_block_dirty(block) && is_block_full(block))
+		schedule_block_write(journal, block);
+
+	recycle_journal_blocks(journal);
+	write_blocks(journal);
+
+	check_for_drain_complete(journal);
+}
+
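+/**
+ * handle_write_error() - Handle an error writing a recovery journal block.
+ * @completion: The completion of the vio which failed to write its block.
+ *
+ * This is the error handler registered by write_block(). It puts the journal into read-only mode
+ * and then completes the write normally so that statistics and waiters are still cleaned up.
+ */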
+static void handle_write_error(struct vdo_completion *completion)
+{
+	struct recovery_journal_block *block = completion->parent;
+	struct recovery_journal *journal = block->journal;
+
+	vio_record_metadata_io_error(as_vio(completion));
+	uds_log_error_strerror(completion->result,
+			       "cannot write recovery journal block %llu",
+			       (unsigned long long) block->sequence_number);
+	enter_journal_read_only_mode(journal, completion->result);
+	complete_write(completion);
+}
+
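+/**
+ * complete_write_endio() - The bio end_io function for a recovery journal block write.
+ * @bio: The completed bio.
+ *
+ * Continues processing the write as complete_write() on the journal thread.
+ */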
+static void complete_write_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct recovery_journal_block *block = vio->completion.parent;
+	struct recovery_journal *journal = block->journal;
+
+	continue_vio_after_io(vio, complete_write, journal->thread_id);
+}
+
+/**
+ * add_queued_recovery_entries() - Actually add entries from the queue to the given block.
+ * @block: The journal block.
+ */
+static void add_queued_recovery_entries(struct recovery_journal_block *block)
+{
+	while (vdo_has_waiters(&block->entry_waiters)) {
+		struct data_vio *data_vio =
+			waiter_as_data_vio(vdo_dequeue_next_waiter(&block->entry_waiters));
+		struct tree_lock *lock = &data_vio->tree_lock;
+		struct packed_recovery_journal_entry *packed_entry;
+		struct recovery_journal_entry new_entry;
+
+		if (block->sector->entry_count == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
+			set_active_sector(block, (char *) block->sector + VDO_SECTOR_SIZE);
+
+		/* Compose and encode the entry. */
+		packed_entry = &block->sector->entries[block->sector->entry_count++];
+		new_entry = (struct recovery_journal_entry) {
+			.mapping = {
+				.pbn = data_vio->increment_updater.zpbn.pbn,
+				.state = data_vio->increment_updater.zpbn.state,
+			},
+			.unmapping = {
+				.pbn = data_vio->decrement_updater.zpbn.pbn,
+				.state = data_vio->decrement_updater.zpbn.state,
+			},
+			.operation = data_vio->increment_updater.operation,
+			.slot = lock->tree_slots[lock->height].block_map_slot,
+		};
+		*packed_entry = vdo_pack_recovery_journal_entry(&new_entry);
+		data_vio->recovery_sequence_number = block->sequence_number;
+
+		/* Enqueue the data_vio to wait for its entry to commit. */
+		vdo_enqueue_waiter(&block->commit_waiters, &data_vio->waiter);
+	}
+}
+
+/**
+ * write_block() - Issue a block for writing.
+ *
+ * Implements waiter_callback.
+ */
+static void write_block(struct waiter *waiter, void *context __always_unused)
+{
+	struct recovery_journal_block *block =
+		container_of(waiter, struct recovery_journal_block, write_waiter);
+	struct recovery_journal *journal = block->journal;
+	struct packed_journal_header *header = get_block_header(block);
+
+	if (block->committing || !vdo_has_waiters(&block->entry_waiters) || is_read_only(journal))
+		return;
+
+	block->entries_in_commit = vdo_count_waiters(&block->entry_waiters);
+	add_queued_recovery_entries(block);
+
+	journal->pending_write_count += 1;
+	journal->events.blocks.written += 1;
+	journal->events.entries.written += block->entries_in_commit;
+
+	header->block_map_head = __cpu_to_le64(journal->block_map_head);
+	header->slab_journal_head = __cpu_to_le64(journal->slab_journal_head);
+	header->entry_count = __cpu_to_le16(block->entry_count);
+
+	block->committing = true;
+
+	/*
+	 * We must issue a flush and a FUA for every commit. The flush is necessary to ensure that
+	 * the data being referenced is stable. The FUA is necessary to ensure that the journal
+	 * block itself is stable before allowing overwrites of the lbn's previous data.
+	 */
+	submit_metadata_vio(&block->vio,
+			    journal->origin + block->block_number,
+			    complete_write_endio,
+			    handle_write_error,
+			    WRITE_FLAGS);
+}
+
+/**
+ * write_blocks() - Attempt to commit blocks, according to write policy.
+ * @journal: The recovery journal.
+ */
+static void write_blocks(struct recovery_journal *journal)
+{
+	assert_on_journal_thread(journal, __func__);
+	/*
+	 * We call this function after adding entries to the journal and after finishing a block
+	 * write. Thus, when this function terminates we must either have no VIOs waiting in the
+	 * journal or have some outstanding IO to provide a future wakeup.
+	 *
+	 * We want to only issue full blocks if there are no pending writes. However, if there are
+	 * no outstanding writes and some unwritten entries, we must issue a block, even if it's
+	 * the active block and it isn't full.
+	 */
+	if (journal->pending_write_count > 0)
+		return;
+
+	/* Write all the full blocks. */
+	vdo_notify_all_waiters(&journal->pending_writes, write_block, NULL);
+
+	/*
+	 * Do we need to write the active block? Only if we have no outstanding writes, even after
+	 * issuing all of the full writes.
+	 */
+	if ((journal->pending_write_count == 0) && (journal->active_block != NULL))
+		write_block(&journal->active_block->write_waiter, NULL);
+}
+
+/**
+ * vdo_add_recovery_journal_entry() - Add an entry to a recovery journal.
+ * @journal: The journal in which to make an entry.
+ * @data_vio: The data_vio for which to add the entry. The entry will be taken
+ *	      from the logical and new_mapped fields of the data_vio. The
+ *	      data_vio's recovery_sequence_number field will be set to the
+ *	      sequence number of the journal block in which the entry was
+ *	      made.
+ *
+ * This method is asynchronous. The data_vio will not be called back until the entry is committed
+ * to the on-disk journal.
+ */
+void vdo_add_recovery_journal_entry(struct recovery_journal *journal, struct data_vio *data_vio)
+{
+	assert_on_journal_thread(journal, __func__);
+	if (!vdo_is_state_normal(&journal->state)) {
+		continue_data_vio_with_error(data_vio, VDO_INVALID_ADMIN_STATE);
+		return;
+	}
+
+	if (is_read_only(journal)) {
+		continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
+		return;
+	}
+
+	ASSERT_LOG_ONLY(data_vio->recovery_sequence_number == 0,
+			"journal lock not held for new entry");
+
+	vdo_advance_journal_point(&journal->append_point, journal->entries_per_block);
+	vdo_enqueue_waiter(&journal->entry_waiters, &data_vio->waiter);
+	assign_entries(journal);
+}
+
+/**
+ * is_lock_locked() - Check whether a lock is locked for a zone type.
+ * @journal: The recovery journal.
+ * @lock_number: The lock to check.
+ * @zone_type: The type of the zone.
+ *
+ * If the recovery journal has a lock on the lock number, both logical and physical zones are
+ * considered locked.
+ *
+ * Return: true if the specified lock has references (is locked).
+ */
+static bool
+is_lock_locked(struct recovery_journal *journal,
+	       block_count_t lock_number,
+	       enum vdo_zone_type zone_type)
+{
+	atomic_t *zone_count;
+	bool locked;
+
+	if (is_journal_zone_locked(journal, lock_number))
+		return true;
+
+	zone_count = get_zone_count_ptr(journal, lock_number, zone_type);
+	locked = (atomic_read(zone_count) != 0);
+	/* Pairs with implicit barrier in vdo_release_recovery_journal_block_reference() */
+	smp_rmb();
+	return locked;
+}
+
+/**
+ * reap_recovery_journal() - Conduct a sweep on a recovery journal to reclaim unreferenced blocks.
+ * @journal: The recovery journal.
+ */
+static void reap_recovery_journal(struct recovery_journal *journal)
+{
+	if (journal->reaping)
+		/*
+		 * We already have an outstanding reap in progress. We need to wait for it to
+		 * finish.
+		 */
+		return;
+
+	if (vdo_is_state_quiescent(&journal->state))
+		/* We are supposed to not do IO. Don't botch it by reaping. */
+		return;
+
+	/*
+	 * Start reclaiming blocks only when the journal head has no references. Then stop when a
+	 * block is referenced.
+	 */
+	while ((journal->block_map_reap_head < journal->last_write_acknowledged) &&
+		!is_lock_locked(journal,
+				journal->block_map_head_block_number,
+				VDO_ZONE_TYPE_LOGICAL)) {
+		journal->block_map_reap_head++;
+		if (++journal->block_map_head_block_number == journal->size)
+			journal->block_map_head_block_number = 0;
+	}
+
+	while ((journal->slab_journal_reap_head < journal->last_write_acknowledged) &&
+		!is_lock_locked(journal,
+				journal->slab_journal_head_block_number,
+				VDO_ZONE_TYPE_PHYSICAL)) {
+		journal->slab_journal_reap_head++;
+		if (++journal->slab_journal_head_block_number == journal->size)
+			journal->slab_journal_head_block_number = 0;
+	}
+
+	if ((journal->block_map_reap_head == journal->block_map_head) &&
+	    (journal->slab_journal_reap_head == journal->slab_journal_head))
+		/* Nothing happened. */
+		return;
+
+	/*
+	 * If the block map head will advance, we must flush any block map page modified by the
+	 * entries we are reaping. If the slab journal head will advance, we must flush the slab
+	 * summary update covering the slab journal that just released some lock.
+	 */
+	journal->reaping = true;
+	submit_flush_vio(journal->flush_vio, flush_endio, handle_flush_error);
+}
+
+/**
+ * vdo_acquire_recovery_journal_block_reference() - Acquire a reference to a recovery journal block
+ *                                                  from somewhere other than the journal itself.
+ * @journal: The recovery journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ * @zone_type: The type of the zone making the adjustment.
+ * @zone_id: The ID of the zone making the adjustment.
+ */
+void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal,
+						  sequence_number_t sequence_number,
+						  enum vdo_zone_type zone_type,
+						  zone_count_t zone_id)
+{
+	block_count_t lock_number;
+	u16 *current_value;
+
+	if (sequence_number == 0)
+		return;
+
+	ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL),
+			"invalid lock count increment from journal zone");
+
+	lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
+	current_value = get_counter(journal, lock_number, zone_type, zone_id);
+	ASSERT_LOG_ONLY(*current_value < U16_MAX,
+			"increment of lock counter must not overflow");
+
+	if (*current_value == 0) {
+		/*
+		 * This zone is acquiring this lock for the first time. Extra barriers because this
+		 * was originally developed using an atomic add operation that implicitly had them.
+		 */
+		smp_mb__before_atomic();
+		atomic_inc(get_zone_count_ptr(journal, lock_number, zone_type));
+		/* same as before_atomic */
+		smp_mb__after_atomic();
+	}
+	*current_value += 1;
+}
+
+/**
+ * vdo_release_journal_entry_lock() - Release a single per-entry reference count for a recovery
+ *                                    journal block.
+ * @journal: The recovery journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ */
+void vdo_release_journal_entry_lock(struct recovery_journal *journal,
+				    sequence_number_t sequence_number)
+{
+	block_count_t lock_number;
+
+	if (sequence_number == 0)
+		return;
+
+	lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
+	/*
+	 * Extra barriers because this was originally developed using an atomic add operation that
+	 * implicitly had them.
+	 */
+	smp_mb__before_atomic();
+	atomic_inc(get_decrement_counter(journal, lock_number));
+	/* same as before_atomic */
+	smp_mb__after_atomic();
+}
+
+/**
+ * initiate_drain() - Initiate a drain.
+ *
+ * Implements vdo_admin_initiator.
+ */
+static void initiate_drain(struct admin_state *state)
+{
+	check_for_drain_complete(container_of(state, struct recovery_journal, state));
+}
+
+/**
+ * vdo_drain_recovery_journal() - Drain recovery journal I/O.
+ * @journal: The journal to drain.
+ * @operation: The drain operation (suspend or save).
+ * @parent: The completion to notify once the journal is drained.
+ *
+ * All uncommitted entries will be written out.
+ */
+void vdo_drain_recovery_journal(struct recovery_journal *journal,
+				const struct admin_state_code *operation,
+				struct vdo_completion *parent)
+{
+	assert_on_journal_thread(journal, __func__);
+	vdo_start_draining(&journal->state, operation, parent, initiate_drain);
+}
+
+/**
+ * resume_lock_counter() - Re-allow notifications from a suspended lock counter.
+ * @counter: The counter.
+ *
+ * Return: true if the lock counter was suspended.
+ */
+static bool resume_lock_counter(struct lock_counter *counter)
+{
+	int prior_state;
+
+	/*
+	 * Extra barriers because this was originally developed using a CAS operation that implicitly
+	 * had them.
+	 */
+	smp_mb__before_atomic();
+	prior_state = atomic_cmpxchg(&counter->state,
+				     LOCK_COUNTER_STATE_SUSPENDED,
+				     LOCK_COUNTER_STATE_NOT_NOTIFYING);
+	/* same as before_atomic */
+	smp_mb__after_atomic();
+
+	return (prior_state == LOCK_COUNTER_STATE_SUSPENDED);
+}
+
+/**
+ * vdo_resume_recovery_journal() - Resume a recovery journal which has been drained.
+ * @journal: The journal to resume.
+ * @parent: The completion to finish once the journal is resumed.
+ */
+void vdo_resume_recovery_journal(struct recovery_journal *journal, struct vdo_completion *parent)
+{
+	bool saved;
+
+	assert_on_journal_thread(journal, __func__);
+	saved = vdo_is_state_saved(&journal->state);
+	vdo_set_completion_result(parent, vdo_resume_if_quiescent(&journal->state));
+	if (is_read_only(journal)) {
+		vdo_continue_completion(parent, VDO_READ_ONLY);
+		return;
+	}
+
+	if (saved)
+		initialize_journal_state(journal);
+
+	if (resume_lock_counter(&journal->lock_counter))
+		/* We might have missed a notification. */
+		reap_recovery_journal(journal);
+
+	vdo_launch_completion(parent);
+}
+
+/**
+ * vdo_get_recovery_journal_logical_blocks_used() - Get the number of logical blocks in use by the
+ *                                                  VDO.
+ * @journal: The journal.
+ *
+ * Return: The number of logical blocks in use by the VDO.
+ */
+block_count_t vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal)
+{
+	return journal->logical_blocks_used;
+}
+
+/**
+ * vdo_get_recovery_journal_statistics() - Get the current statistics from the recovery journal.
+ * @journal: The recovery journal to query.
+ *
+ * Return: A copy of the current statistics for the journal.
+ */
+struct recovery_journal_statistics
+vdo_get_recovery_journal_statistics(const struct recovery_journal *journal)
+{
+	return journal->events;
+}
+
+/**
+ * dump_recovery_block() - Dump the contents of the recovery block to the log.
+ * @block: The block to dump.
+ */
+static void dump_recovery_block(const struct recovery_journal_block *block)
+{
+	uds_log_info("    sequence number %llu; entries %u; %s; %zu entry waiters; %zu commit waiters",
+		     (unsigned long long) block->sequence_number,
+		     block->entry_count,
+		     (block->committing ? "committing" : "waiting"),
+		     vdo_count_waiters(&block->entry_waiters),
+		     vdo_count_waiters(&block->commit_waiters));
+}
+
+/**
+ * vdo_dump_recovery_journal_statistics() - Dump some current statistics and other debug info from
+ *                                          the recovery journal.
+ * @journal: The recovery journal to dump.
+ */
+void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal)
+{
+	const struct recovery_journal_block *block;
+	struct recovery_journal_statistics stats = vdo_get_recovery_journal_statistics(journal);
+
+	uds_log_info("Recovery Journal");
+	uds_log_info("	block_map_head=%llu slab_journal_head=%llu last_write_acknowledged=%llu tail=%llu block_map_reap_head=%llu slab_journal_reap_head=%llu disk_full=%llu slab_journal_commits_requested=%llu entry_waiters=%zu",
+		     (unsigned long long) journal->block_map_head,
+		     (unsigned long long) journal->slab_journal_head,
+		     (unsigned long long) journal->last_write_acknowledged,
+		     (unsigned long long) journal->tail,
+		     (unsigned long long) journal->block_map_reap_head,
+		     (unsigned long long) journal->slab_journal_reap_head,
+		     (unsigned long long) stats.disk_full,
+		     (unsigned long long) stats.slab_journal_commits_requested,
+		     vdo_count_waiters(&journal->entry_waiters));
+	uds_log_info("	entries: started=%llu written=%llu committed=%llu",
+		     (unsigned long long) stats.entries.started,
+		     (unsigned long long) stats.entries.written,
+		     (unsigned long long) stats.entries.committed);
+	uds_log_info("	blocks: started=%llu written=%llu committed=%llu",
+		     (unsigned long long) stats.blocks.started,
+		     (unsigned long long) stats.blocks.written,
+		     (unsigned long long) stats.blocks.committed);
+
+	uds_log_info("	active blocks:");
+	list_for_each_entry(block, &journal->active_tail_blocks, list_node)
+		dump_recovery_block(block);
+}
diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h
new file mode 100644
index 00000000000..5845f1e4b4c
--- /dev/null
+++ b/drivers/md/dm-vdo/recovery-journal.h
@@ -0,0 +1,313 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_RECOVERY_JOURNAL_H
+#define VDO_RECOVERY_JOURNAL_H
+
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "constants.h"
+#include "encodings.h"
+#include "flush.h"
+#include "statistics.h"
+#include "types.h"
+#include "wait-queue.h"
+
+/**
+ * DOC: recovery journal.
+ *
+ * The recovery_journal provides a log of all block mapping and reference count changes which have
+ * not yet been stably written to the block map or slab journals. This log helps to reduce the
+ * write amplification of writes by providing amortization of slab journal and block map page
+ * updates.
+ *
+ * The journal consists of a set of on-disk blocks arranged as a circular log with monotonically
+ * increasing sequence numbers. Three sequence numbers serve to define the active extent of the
+ * journal. The 'head' is the oldest active block in the journal. The 'tail' is the end of the
+ * half-open interval containing the active blocks. 'active' is the number of the block actively
+ * receiving entries. In an empty journal, head == active == tail. Once any entries are added, tail
+ * = active + 1, and head may be any value in the interval [tail - size, active].
+ *
+ * The journal also contains a set of in-memory blocks which are used to buffer up entries until
+ * they can be committed. In general the number of in-memory blocks ('tail_buffer_count') will be
+ * less than the on-disk size. Each in-memory block is also a vdo_completion. Each in-memory block
+ * has a vio which is used to commit that block to disk. The vio's data is the on-disk
+ * representation of the journal block. In addition each in-memory block has a buffer which is used
+ * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are
+ * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active
+ * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is
+ * moved back to the 'free_tail_blocks' ring.
+ *
+ * When entries are added to the journal, they are added to the active in-memory block, as
+ * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be
+ * committed, the requesting VIO will be attached to the in-memory block to which the caller's
+ * entry was added. If the caller does wish to wait, or if the entry filled the active block, an
+ * attempt will be made to commit that block to disk. If there is already another commit in
+ * progress, the attempt will be ignored and then automatically retried when the in-progress commit
+ * completes. If there is no commit in progress, any data_vios waiting on the block are transferred
+ * to the block's vio which is then written, automatically waking all of the waiters when it
+ * completes. When the write completes, any entries which accumulated in the block are copied to
+ * the vio's data buffer.
+ *
+ * Finally, the journal maintains a set of counters, one for each on disk journal block. These
+ * counters are used as locks to prevent premature reaping of journal blocks. Each time a new
+ * sequence number is used, the counter for the corresponding block is incremented. The counter is
+ * subsequently decremented when that block is filled and then committed for the last time. This
+ * prevents blocks from being reaped while they are still being updated. The counter is also
+ * incremented once for each entry added to a block, and decremented once each time the block map
+ * is updated in memory for that request. This prevents blocks from being reaped while their VIOs
+ * are still active. Finally, each in-memory block map page tracks the oldest journal block that
+ * contains entries corresponding to uncommitted updates to that block map page. Each time an
+ * in-memory block map page is updated, it checks if the journal block for the VIO is earlier than
+ * the one it references, in which case it increments the count on the earlier journal block and
+ * decrements the count on the later journal block, maintaining a lock on the oldest journal block
+ * containing entries for that page. When a block map page has been flushed from the cache, the
+ * counter for the journal block it references is decremented. Whenever the counter for the head
+ * block goes to 0, the head is advanced until it comes to a block whose counter is not 0 or until
+ * it reaches the active block. This is the mechanism for reclaiming journal space on disk.
+ *
+ * If there is no in-memory space when a VIO attempts to add an entry, the VIO will be attached to
+ * the 'commit_completion' and will be woken the next time a full block has committed. If there is
+ * no on-disk space when a VIO attempts to add an entry, the VIO will be attached to the
+ * 'reap_completion', and will be woken the next time a journal block is reaped.
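+ *
+ * As a purely illustrative example of the sequence number invariants above: with a journal of 4
+ * on-disk blocks, if blocks 7 and 8 are committed but still hold locks and block 9 is receiving
+ * entries, then head == 7, active == 9, and tail == 10.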
+ */
+
+enum vdo_zone_type {
+	VDO_ZONE_TYPE_ADMIN,
+	VDO_ZONE_TYPE_JOURNAL,
+	VDO_ZONE_TYPE_LOGICAL,
+	VDO_ZONE_TYPE_PHYSICAL,
+};
+
+struct lock_counter {
+	/** The completion for notifying the owner of a lock release */
+	struct vdo_completion completion;
+	/** The number of logical zones which may hold locks */
+	zone_count_t logical_zones;
+	/** The number of physical zones which may hold locks */
+	zone_count_t physical_zones;
+	/** The number of locks */
+	block_count_t locks;
+	/** Whether the lock release notification is in flight */
+	atomic_t state;
+	/** The number of logical zones which hold each lock */
+	atomic_t *logical_zone_counts;
+	/** The number of physical zones which hold each lock */
+	atomic_t *physical_zone_counts;
+	/** The per-lock counts for the journal zone */
+	u16 *journal_counters;
+	/** The per-lock decrement counts for the journal zone */
+	atomic_t *journal_decrement_counts;
+	/** The per-zone, per-lock reference counts for logical zones */
+	u16 *logical_counters;
+	/** The per-zone, per-lock reference counts for physical zones */
+	u16 *physical_counters;
+};
+
+struct recovery_journal_block {
+	/* The doubly linked pointers for the free or active lists */
+	struct list_head list_node;
+	/* The waiter for the pending full block list */
+	struct waiter write_waiter;
+	/* The journal to which this block belongs */
+	struct recovery_journal *journal;
+	/* A pointer to the current sector in the packed block buffer */
+	struct packed_journal_sector *sector;
+	/* The vio for writing this block */
+	struct vio vio;
+	/* The sequence number for this block */
+	sequence_number_t sequence_number;
+	/* The location of this block in the on-disk journal */
+	physical_block_number_t block_number;
+	/* Whether this block is being committed */
+	bool committing;
+	/* The total number of entries in this block */
+	journal_entry_count_t entry_count;
+	/* The total number of uncommitted entries (queued or committing) */
+	journal_entry_count_t uncommitted_entry_count;
+	/* The number of new entries in the current commit */
+	journal_entry_count_t entries_in_commit;
+	/* The queue of vios which will make entries for the next commit */
+	struct wait_queue entry_waiters;
+	/* The queue of vios waiting for the current commit */
+	struct wait_queue commit_waiters;
+};
+
+struct recovery_journal {
+	/* The thread ID of the journal zone */
+	thread_id_t thread_id;
+	/* The slab depot which can hold locks on this journal */
+	struct slab_depot *depot;
+	/* The block map which can hold locks on this journal */
+	struct block_map *block_map;
+	/* The queue of vios waiting to make entries */
+	struct wait_queue entry_waiters;
+	/* The number of free entries in the journal */
+	u64 available_space;
+	/* The number of decrement entries which need to be made */
+	data_vio_count_t pending_decrement_count;
+	/* Whether the journal is adding entries from the increment or decrement waiters queues */
+	bool adding_entries;
+	/* The administrative state of the journal */
+	struct admin_state state;
+	/* Whether a reap is in progress */
+	bool reaping;
+	/* The location of the first journal block */
+	physical_block_number_t origin;
+	/* The oldest active block in the journal on disk for block map rebuild */
+	sequence_number_t block_map_head;
+	/* The oldest active block in the journal on disk for slab journal replay */
+	sequence_number_t slab_journal_head;
+	/* The newest block in the journal on disk to which a write has finished */
+	sequence_number_t last_write_acknowledged;
+	/* The end of the half-open interval of the active journal */
+	sequence_number_t tail;
+	/* The point at which the last entry will have been added */
+	struct journal_point append_point;
+	/* The journal point of the vio most recently released from the journal */
+	struct journal_point commit_point;
+	/* The nonce of the VDO */
+	nonce_t nonce;
+	/* The number of recoveries completed by the VDO */
+	u8 recovery_count;
+	/* The number of entries which fit in a single block */
+	journal_entry_count_t entries_per_block;
+	/* Unused in-memory journal blocks */
+	struct list_head free_tail_blocks;
+	/* In-memory journal blocks with records */
+	struct list_head active_tail_blocks;
+	/* A pointer to the active block (the one we are adding entries to now) */
+	struct recovery_journal_block *active_block;
+	/* Journal blocks that need writing */
+	struct wait_queue pending_writes;
+	/* The new block map reap head after reaping */
+	sequence_number_t block_map_reap_head;
+	/* The head block number for the block map rebuild range */
+	block_count_t block_map_head_block_number;
+	/* The new slab journal reap head after reaping */
+	sequence_number_t slab_journal_reap_head;
+	/* The head block number for the slab journal replay range */
+	block_count_t slab_journal_head_block_number;
+	/* The data-less vio, usable only for flushing */
+	struct vio *flush_vio;
+	/* The number of blocks in the on-disk journal */
+	block_count_t size;
+	/* The number of logical blocks that are in-use */
+	block_count_t logical_blocks_used;
+	/* The number of block map pages that are allocated */
+	block_count_t block_map_data_blocks;
+	/* The number of journal blocks written but not yet acknowledged */
+	block_count_t pending_write_count;
+	/* The threshold at which slab journal tail blocks will be written out */
+	block_count_t slab_journal_commit_threshold;
+	/* Counters for events in the journal that are reported as statistics */
+	struct recovery_journal_statistics events;
+	/* The locks for each on-disk block */
+	struct lock_counter lock_counter;
+	/* The tail blocks */
+	struct recovery_journal_block blocks[];
+};
+
+/**
+ * vdo_get_recovery_journal_block_number() - Get the physical block number for a given sequence
+ *                                           number.
+ * @journal: The journal.
+ * @sequence: The sequence number of the desired block.
+ *
+ * Return: The block number corresponding to the sequence number.
+ */
+static inline physical_block_number_t __must_check
+vdo_get_recovery_journal_block_number(const struct recovery_journal *journal,
+				      sequence_number_t sequence)
+{
+	/*
+	 * Since journal size is a power of two, the block number modulus can just be extracted
+	 * from the low-order bits of the sequence.
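+	 * For example, a journal of 64 blocks maps sequence number 130 to block number 2.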
+	 */
+	return vdo_compute_recovery_journal_block_number(journal->size, sequence);
+}
+
+/**
+ * vdo_compute_recovery_journal_check_byte() - Compute the check byte for a given sequence number.
+ * @journal: The journal.
+ * @sequence: The sequence number.
+ *
+ * Return: The check byte corresponding to the sequence number.
+ */
+static inline u8 __must_check
+vdo_compute_recovery_journal_check_byte(const struct recovery_journal *journal,
+					sequence_number_t sequence)
+{
+	/* The check byte must change with each trip around the journal. */
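+	/* E.g., with a 64-block journal, sequences 0..63 yield 0x80 and 64..127 yield 0x81. */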
+	return (((sequence / journal->size) & 0x7F) | 0x80);
+}
+
+int __must_check vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state,
+					     nonce_t nonce,
+					     struct vdo *vdo,
+					     struct partition *partition,
+					     u64 recovery_count,
+					     block_count_t journal_size,
+					     struct recovery_journal **journal_ptr);
+
+void vdo_free_recovery_journal(struct recovery_journal *journal);
+
+void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal,
+						 u64 recovery_count,
+						 sequence_number_t tail,
+						 block_count_t logical_blocks_used,
+						 block_count_t block_map_data_blocks);
+
+block_count_t __must_check
+vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal);
+
+thread_id_t __must_check vdo_get_recovery_journal_thread_id(struct recovery_journal *journal);
+
+void vdo_open_recovery_journal(struct recovery_journal *journal,
+			       struct slab_depot *depot,
+			       struct block_map *block_map);
+
+sequence_number_t
+vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal);
+
+block_count_t __must_check vdo_get_recovery_journal_length(block_count_t journal_size);
+
+struct recovery_journal_state_7_0 __must_check
+vdo_record_recovery_journal(const struct recovery_journal *journal);
+
+void vdo_add_recovery_journal_entry(struct recovery_journal *journal, struct data_vio *data_vio);
+
+void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal,
+						  sequence_number_t sequence_number,
+						  enum vdo_zone_type zone_type,
+						  zone_count_t zone_id);
+
+void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal,
+						  sequence_number_t sequence_number,
+						  enum vdo_zone_type zone_type,
+						  zone_count_t zone_id);
+
+void vdo_release_journal_entry_lock(struct recovery_journal *journal,
+				    sequence_number_t sequence_number);
+
+void vdo_drain_recovery_journal(struct recovery_journal *journal,
+				const struct admin_state_code *operation,
+				struct vdo_completion *parent);
+
+void vdo_resume_recovery_journal(struct recovery_journal *journal,
+				 struct vdo_completion *parent);
+
+block_count_t __must_check
+vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal);
+
+struct recovery_journal_statistics __must_check
+vdo_get_recovery_journal_statistics(const struct recovery_journal *journal);
+
+void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal);
+
+#endif /* VDO_RECOVERY_JOURNAL_H */
diff --git a/drivers/md/dm-vdo/release-versions.h b/drivers/md/dm-vdo/release-versions.h
new file mode 100644
index 00000000000..0abc13c04b8
--- /dev/null
+++ b/drivers/md/dm-vdo/release-versions.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef RELEASE_VERSIONS_H
+#define RELEASE_VERSIONS_H
+
+enum {
+	VDO_OXYGEN_RELEASE_VERSION_NUMBER = 109583,
+	VDO_FLUORINE_RELEASE_VERSION_NUMBER = 115838,
+	VDO_NEON_RELEASE_VERSION_NUMBER = 120965,
+	VDO_SODIUM_RELEASE_VERSION_NUMBER = 127441,
+	VDO_MAGNESIUM_RELEASE_VERSION_NUMBER = 131337,
+	VDO_ALUMINUM_RELEASE_VERSION_NUMBER = 133524,
+	VDO_HEAD_RELEASE_VERSION_NUMBER = 0,
+	VDO_CURRENT_RELEASE_VERSION_NUMBER = VDO_HEAD_RELEASE_VERSION_NUMBER,
+};
+
+#endif /* RELEASE_VERSIONS_H */
diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c
new file mode 100644
index 00000000000..455b9a89230
--- /dev/null
+++ b/drivers/md/dm-vdo/repair.c
@@ -0,0 +1,1775 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "repair.h"
+
+#include <linux/min_heap.h>
+#include <linux/minmax.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "constants.h"
+#include "encodings.h"
+#include "int-map.h"
+#include "io-submitter.h"
+#include "recovery-journal.h"
+#include "slab-depot.h"
+#include "types.h"
+#include "vdo.h"
+#include "wait-queue.h"
+
+/*
+ * An explicitly numbered block mapping. Numbering the mappings allows them to be sorted by logical
+ * block number during repair while still preserving the relative order of journal entries with
+ * the same logical block number.
+ */
+struct numbered_block_mapping {
+	struct block_map_slot block_map_slot;
+	struct block_map_entry block_map_entry;
+	/* A serial number to use during replay */
+	u32 number;
+} __packed;
+
+/*
+ * The absolute position of an entry in the recovery journal, including the sector number and the
+ * entry number within the sector.
+ */
+struct recovery_point {
+	/* Block sequence number */
+	sequence_number_t sequence_number;
+	/* Sector number */
+	u8 sector_count;
+	/* Entry number */
+	journal_entry_count_t entry_count;
+	/* Whether or not the increment portion of the current entry has been applied */
+	bool increment_applied;
+};
+
+struct repair_completion {
+	/* The completion header */
+	struct vdo_completion completion;
+
+	/* A buffer to hold the data read off disk */
+	char *journal_data;
+
+	/* For loading the journal */
+	data_vio_count_t vio_count;
+	data_vio_count_t vios_complete;
+	struct vio *vios;
+
+	/* The number of entries to be applied to the block map */
+	size_t block_map_entry_count;
+	/* The sequence number of the first valid block for block map recovery */
+	sequence_number_t block_map_head;
+	/* The sequence number of the first valid block for slab journal replay */
+	sequence_number_t slab_journal_head;
+	/* The sequence number of the last valid block of the journal (if known) */
+	sequence_number_t tail;
+	/*
+	 * The highest sequence number of the journal. During recovery (vs read-only rebuild), not
+	 * the same as the tail, since the tail ignores blocks after the first hole.
+	 */
+	sequence_number_t highest_tail;
+
+	/* The number of logical blocks currently known to be in use */
+	block_count_t logical_blocks_used;
+	/* The number of block map data blocks known to be allocated */
+	block_count_t block_map_data_blocks;
+
+	/* These fields are for playing the journal into the block map */
+	/* The entry data for the block map recovery */
+	struct numbered_block_mapping *entries;
+	/* The number of entries in the entry array */
+	size_t entry_count;
+	/* The number of pending (non-ready) requests */
+	page_count_t outstanding;
+	/* The number of page completions */
+	page_count_t page_count;
+	bool launching;
+	/*
+	 * A heap wrapping journal_entries. It re-orders and sorts journal entries in ascending LBN
+	 * order, then original journal order. This permits efficient iteration over the journal
+	 * entries in order.
+	 */
+	struct min_heap replay_heap;
+	/* Fields tracking progress through the journal entries. */
+	struct numbered_block_mapping *current_entry;
+	struct numbered_block_mapping *current_unfetched_entry;
+	/* Current requested page's PBN */
+	physical_block_number_t pbn;
+
+	/* These fields are only used during recovery. */
+	/* A location just beyond the last valid entry of the journal */
+	struct recovery_point tail_recovery_point;
+	/* The location of the next recovery journal entry to apply */
+	struct recovery_point next_recovery_point;
+	/* The journal point to give to the next synthesized decref */
+	struct journal_point next_journal_point;
+	/* The number of entries played into slab journals */
+	size_t entries_added_to_slab_journals;
+
+	/* These fields are only used during read-only rebuild */
+	page_count_t page_to_fetch;
+	/* The number of leaf pages in the block map */
+	page_count_t leaf_pages;
+	/* The last slot of the block map */
+	struct block_map_slot last_slot;
+
+	/*
+	 * The page completions used for playing the journal into the block map, and, during
+	 * read-only rebuild, for rebuilding the reference counts from the block map.
+	 */
+	struct vdo_page_completion page_completions[];
+};
+
+/*
+ * This is a min_heap callback function that orders numbered_block_mappings using the
+ * 'block_map_slot' field as the primary key and the mapping 'number' field as the secondary key.
+ * Using the mapping number preserves the journal order of entries for the same slot, allowing us
+ * to sort by slot while still ensuring we replay all entries with the same slot in the exact order
+ * as they appeared in the journal.
+ */
+static bool mapping_is_less_than(const void *item1, const void *item2)
+{
+	const struct numbered_block_mapping *mapping1 =
+		(const struct numbered_block_mapping *) item1;
+	const struct numbered_block_mapping *mapping2 =
+		(const struct numbered_block_mapping *) item2;
+
+	if (mapping1->block_map_slot.pbn != mapping2->block_map_slot.pbn)
+		return mapping1->block_map_slot.pbn < mapping2->block_map_slot.pbn;
+
+	if (mapping1->block_map_slot.slot != mapping2->block_map_slot.slot)
+		return mapping1->block_map_slot.slot < mapping2->block_map_slot.slot;
+
+	if (mapping1->number != mapping2->number)
+		return mapping1->number < mapping2->number;
+
+	return false;
+}
+
+static void swap_mappings(void *item1, void *item2)
+{
+	struct numbered_block_mapping *mapping1 = item1;
+	struct numbered_block_mapping *mapping2 = item2;
+
+	swap(*mapping1, *mapping2);
+}
+
+static const struct min_heap_callbacks repair_min_heap = {
+	.elem_size = sizeof(struct numbered_block_mapping),
+	.less = mapping_is_less_than,
+	.swp = swap_mappings,
+};
+
+static struct numbered_block_mapping *
+sort_next_heap_element(struct repair_completion *repair)
+{
+	struct min_heap *heap = &repair->replay_heap;
+	struct numbered_block_mapping *last;
+
+	if (heap->nr == 0)
+		return NULL;
+
+	/*
+	 * Swap the next heap element with the last one on the heap, popping it off the heap,
+	 * restore the heap invariant, and return a pointer to the popped element.
+	 */
+	last = &repair->entries[--heap->nr];
+	swap_mappings(heap->data, last);
+	min_heapify(heap, 0, &repair_min_heap);
+	return last;
+}
+
+/**
+ * as_repair_completion() - Convert a generic completion to a repair_completion.
+ * @completion: The completion to convert.
+ *
+ * Return: The repair_completion.
+ */
+static inline struct repair_completion * __must_check
+as_repair_completion(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_REPAIR_COMPLETION);
+	return container_of(completion, struct repair_completion, completion);
+}
+
+static void prepare_repair_completion(struct repair_completion *repair,
+				      vdo_action *callback,
+				      enum vdo_zone_type zone_type)
+{
+	struct vdo_completion *completion = &repair->completion;
+	const struct thread_config *thread_config = &completion->vdo->thread_config;
+	thread_id_t thread_id;
+
+	/* All block map access is done on a single thread, so use logical zone 0. */
+	thread_id = ((zone_type == VDO_ZONE_TYPE_LOGICAL) ?
+		     thread_config->logical_threads[0] :
+		     thread_config->admin_thread);
+	vdo_reset_completion(completion);
+	vdo_set_completion_callback(completion, callback, thread_id);
+}
+
+static void launch_repair_completion(struct repair_completion *repair,
+				     vdo_action *callback,
+				     enum vdo_zone_type zone_type)
+{
+	prepare_repair_completion(repair, callback, zone_type);
+	vdo_launch_completion(&repair->completion);
+}
+
+static void uninitialize_vios(struct repair_completion *repair)
+{
+	while (repair->vio_count > 0)
+		free_vio_components(&repair->vios[--repair->vio_count]);
+
+	UDS_FREE(UDS_FORGET(repair->vios));
+}
+
+static void free_repair_completion(struct repair_completion *repair)
+{
+	if (repair == NULL)
+		return;
+
+	/*
+	 * We do this here because this function is the only common bottleneck for all cleanup
+	 * paths.
+	 */
+	repair->completion.vdo->block_map->zones[0].page_cache.rebuilding = false;
+
+	uninitialize_vios(repair);
+	UDS_FREE(UDS_FORGET(repair->journal_data));
+	UDS_FREE(UDS_FORGET(repair->entries));
+	UDS_FREE(repair);
+}
+
+static void finish_repair(struct vdo_completion *completion)
+{
+	struct vdo_completion *parent = completion->parent;
+	struct vdo *vdo = completion->vdo;
+	struct repair_completion *repair = as_repair_completion(completion);
+
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	if (vdo->load_state != VDO_REBUILD_FOR_UPGRADE)
+		vdo->states.vdo.complete_recoveries++;
+
+	vdo_initialize_recovery_journal_post_repair(vdo->recovery_journal,
+						    vdo->states.vdo.complete_recoveries,
+						    repair->highest_tail,
+						    repair->logical_blocks_used,
+						    repair->block_map_data_blocks);
+	free_repair_completion(UDS_FORGET(repair));
+
+	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
+		uds_log_info("Read-only rebuild complete");
+		vdo_launch_completion(parent);
+		return;
+	}
+
+	/* FIXME: shouldn't this say either "recovery" or "repair"? */
+	uds_log_info("Rebuild complete");
+
+	/*
+	 * Now that we've freed the repair completion and its vast array of journal entries, we
+	 * can allocate refcounts.
+	 */
+	vdo_continue_completion(parent, vdo_allocate_reference_counters(vdo->depot));
+}
+
+/**
+ * abort_repair() - Handle a repair error.
+ * @completion: The repair completion.
+ */
+static void abort_repair(struct vdo_completion *completion)
+{
+	struct vdo_completion *parent = completion->parent;
+	int result = completion->result;
+	struct repair_completion *repair = as_repair_completion(completion);
+
+	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state))
+		uds_log_info("Read-only rebuild aborted");
+	else
+		uds_log_warning("Recovery aborted");
+
+	free_repair_completion(UDS_FORGET(repair));
+	vdo_continue_completion(parent, result);
+}
+
+/**
+ * abort_on_error() - Abort a repair if there is an error.
+ * @result: The result to check.
+ * @repair: The repair completion.
+ *
+ * Return: true if the result was an error.
+ */
+static bool __must_check abort_on_error(int result, struct repair_completion *repair)
+{
+	if (result == VDO_SUCCESS)
+		return false;
+
+	vdo_fail_completion(&repair->completion, result);
+	return true;
+}
+
+/**
+ * drain_slab_depot() - Flush out all dirty refcount blocks now that they have been rebuilt or
+ *                      recovered.
+ * @completion: The repair completion.
+ */
+static void drain_slab_depot(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+	struct repair_completion *repair = as_repair_completion(completion);
+	const struct admin_state_code *operation;
+
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
+	if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
+		uds_log_info("Saving rebuilt state");
+		operation = VDO_ADMIN_STATE_REBUILDING;
+	} else {
+		uds_log_info("Replayed %zu journal entries into slab journals",
+			     repair->entries_added_to_slab_journals);
+		operation = VDO_ADMIN_STATE_RECOVERING;
+	}
+
+	vdo_drain_slab_depot(vdo->depot, operation, completion);
+}
+
+/**
+ * flush_block_map_updates() - Flush the block map now that all the reference counts are rebuilt.
+ * @completion: The repair completion.
+ *
+ * This callback is registered in finish_if_done().
+ */
+static void flush_block_map_updates(struct vdo_completion *completion)
+{
+	vdo_assert_on_admin_thread(completion->vdo, __func__);
+
+	uds_log_info("Flushing block map changes");
+	prepare_repair_completion(as_repair_completion(completion),
+				  drain_slab_depot,
+				  VDO_ZONE_TYPE_ADMIN);
+	vdo_drain_block_map(completion->vdo->block_map, VDO_ADMIN_STATE_RECOVERING, completion);
+}
+
+static bool fetch_page(struct repair_completion *repair, struct vdo_completion *completion);
+
+/**
+ * handle_page_load_error() - Handle an error loading a page.
+ * @completion: The vdo_page_completion.
+ */
+static void handle_page_load_error(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = completion->parent;
+
+	repair->outstanding--;
+	vdo_set_completion_result(&repair->completion, completion->result);
+	vdo_release_page_completion(completion);
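+	/*
+	 * With an error recorded on the repair completion, fetch_page() will not request any
+	 * more pages; it will move on once no page reads remain outstanding.
+	 */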
+	fetch_page(repair, completion);
+}
+
+/**
+ * unmap_entry() - Unmap an invalid entry and indicate that its page must be written out.
+ * @page: The page containing the entries
+ * @completion: The page_completion for writing the page
+ * @slot: The slot to unmap
+ */
+static void
+unmap_entry(struct block_map_page *page, struct vdo_completion *completion, slot_number_t slot)
+{
+	page->entries[slot] = vdo_pack_block_map_entry(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED);
+	vdo_request_page_write(completion);
+}
+
+/**
+ * remove_out_of_bounds_entries() - Unmap entries which are beyond the end of the logical space.
+ * @page: The page containing the entries
+ * @completion: The page_completion for writing the page
+ * @start: The first slot to check
+ */
+static void remove_out_of_bounds_entries(struct block_map_page *page,
+					 struct vdo_completion *completion,
+					 slot_number_t start)
+{
+	slot_number_t slot;
+
+	for (slot = start; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) {
+		struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
+
+		if (vdo_is_mapped_location(&mapping))
+			unmap_entry(page, completion, slot);
+	}
+}
+
+/**
+ * process_slot() - Update the reference counts for a single entry.
+ * @page: The page containing the entries
+ * @completion: The page_completion for writing the page
+ * @slot: The slot to check
+ *
+ * Return: true if the entry was a valid mapping
+ */
+static bool
+process_slot(struct block_map_page *page, struct vdo_completion *completion, slot_number_t slot)
+{
+	struct slab_depot *depot = completion->vdo->depot;
+	int result;
+	struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
+
+	if (!vdo_is_valid_location(&mapping)) {
+		/* This entry is invalid, so remove it from the page. */
+		unmap_entry(page, completion, slot);
+		return false;
+	}
+
+	if (!vdo_is_mapped_location(&mapping))
+		return false;
+
+	if (mapping.pbn == VDO_ZERO_BLOCK)
+		return true;
+
+	if (!vdo_is_physical_data_block(depot, mapping.pbn)) {
+		/*
+		 * This is a nonsense mapping. Remove it from the map so we're at least consistent
+		 * and mark the page dirty.
+		 */
+		unmap_entry(page, completion, slot);
+		return false;
+	}
+
+	result = vdo_adjust_reference_count_for_rebuild(depot,
+							mapping.pbn,
+							VDO_JOURNAL_DATA_REMAPPING);
+	if (result == VDO_SUCCESS)
+		return true;
+
+	uds_log_error_strerror(result,
+			       "Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu",
+			       (unsigned long long) vdo_get_block_map_page_pbn(page),
+			       slot,
+			       (unsigned long long) mapping.pbn);
+	unmap_entry(page, completion, slot);
+	return false;
+}
+
+/**
+ * rebuild_reference_counts_from_page() - Rebuild reference counts from a block map page.
+ * @repair: The repair completion.
+ * @completion: The page completion holding the page.
+ */
+static void rebuild_reference_counts_from_page(struct repair_completion *repair,
+					       struct vdo_completion *completion)
+{
+	slot_number_t slot, last_slot;
+	struct block_map_page *page;
+	int result;
+
+	result = vdo_get_cached_page(completion, &page);
+	if (result != VDO_SUCCESS) {
+		vdo_set_completion_result(&repair->completion, result);
+		return;
+	}
+
+	if (!page->header.initialized)
+		return;
+
+	/* Remove any bogus entries which exist beyond the end of the logical space. */
+	if (vdo_get_block_map_page_pbn(page) == repair->last_slot.pbn) {
+		last_slot = repair->last_slot.slot;
+		remove_out_of_bounds_entries(page, completion, last_slot);
+	} else {
+		last_slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+	}
+
+	/* Inform the slab depot of all entries on this page. */
+	for (slot = 0; slot < last_slot; slot++) {
+		if (process_slot(page, completion, slot))
+			repair->logical_blocks_used++;
+	}
+}
+
+/**
+ * page_loaded() - Process a page which has just been loaded.
+ * @completion: The vdo_page_completion for the fetched page.
+ *
+ * This callback is registered by fetch_page().
+ */
+static void page_loaded(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = completion->parent;
+
+	repair->outstanding--;
+	rebuild_reference_counts_from_page(repair, completion);
+	vdo_release_page_completion(completion);
+
+	/* Advance progress to the next page, and fetch the next page we haven't yet requested. */
+	fetch_page(repair, completion);
+}
+
+static physical_block_number_t
+get_pbn_to_fetch(struct repair_completion *repair, struct block_map *block_map)
+{
+	physical_block_number_t pbn = VDO_ZERO_BLOCK;
+
+	if (repair->completion.result != VDO_SUCCESS)
+		return VDO_ZERO_BLOCK;
+
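+	/* Skip over any leaf pages which do not have an allocated PBN. */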
+	while ((pbn == VDO_ZERO_BLOCK) && (repair->page_to_fetch < repair->leaf_pages))
+		pbn = vdo_find_block_map_page_pbn(block_map, repair->page_to_fetch++);
+
+	if (vdo_is_physical_data_block(repair->completion.vdo->depot, pbn))
+		return pbn;
+
+	vdo_set_completion_result(&repair->completion, VDO_BAD_MAPPING);
+	return VDO_ZERO_BLOCK;
+}
+
+/**
+ * fetch_page() - Fetch a page from the block map.
+ * @repair: The repair_completion.
+ * @completion: The page completion to use.
+ *
+ * Return: true if the rebuild is complete.
+ */
+static bool fetch_page(struct repair_completion *repair, struct vdo_completion *completion)
+{
+	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
+	struct block_map *block_map = repair->completion.vdo->block_map;
+	physical_block_number_t pbn = get_pbn_to_fetch(repair, block_map);
+
+	if (pbn != VDO_ZERO_BLOCK) {
+		repair->outstanding++;
+		/*
+		 * We must set the requeue flag here to ensure that we don't blow the stack if all
+		 * the requested pages are already in the cache or get load errors.
+		 */
+		vdo_get_page(page_completion,
+			     &block_map->zones[0],
+			     pbn,
+			     true,
+			     repair,
+			     page_loaded,
+			     handle_page_load_error,
+			     true);
+	}
+
+	if (repair->outstanding > 0)
+		return false;
+
+	launch_repair_completion(repair, flush_block_map_updates, VDO_ZONE_TYPE_ADMIN);
+	return true;
+}
+
+/**
+ * rebuild_from_leaves() - Rebuild reference counts from the leaf block map pages.
+ * @completion: The repair completion.
+ *
+ * Rebuilds reference counts from the leaf block map pages now that reference counts have been
+ * rebuilt from the interior tree pages (which have been loaded in the process). This callback is
+ * registered in rebuild_reference_counts().
+ */
+static void rebuild_from_leaves(struct vdo_completion *completion)
+{
+	page_count_t i;
+	struct repair_completion *repair = as_repair_completion(completion);
+	struct block_map *map = completion->vdo->block_map;
+
+	repair->logical_blocks_used = 0;
+
+	/*
+	 * The PBN calculation doesn't work until the tree pages have been loaded, so we can't set
+	 * this value at the start of repair.
+	 */
+	repair->leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
+	repair->last_slot = (struct block_map_slot) {
+		.slot = map->entry_count % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
+		.pbn = vdo_find_block_map_page_pbn(map, repair->leaf_pages - 1),
+	};
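+	/* A remainder of 0 means the last leaf page is completely full. */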
+	if (repair->last_slot.slot == 0)
+		repair->last_slot.slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
+
+	for (i = 0; i < repair->page_count; i++) {
+		if (fetch_page(repair, &repair->page_completions[i].completion))
+			/*
+			 * The rebuild has already moved on, so it is neither safe nor necessary to
+			 * launch any more fetches.
+			 */
+			return;
+	}
+}
+
+/**
+ * process_entry() - Process a single entry from the block map tree.
+ * @pbn: A pbn which holds a block map tree page.
+ * @completion: The parent completion of the traversal.
+ *
+ * Implements vdo_entry_callback.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int process_entry(physical_block_number_t pbn, struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	struct slab_depot *depot = completion->vdo->depot;
+	int result;
+
+	if ((pbn == VDO_ZERO_BLOCK) || !vdo_is_physical_data_block(depot, pbn))
+		return uds_log_error_strerror(VDO_BAD_CONFIGURATION,
+					      "PBN %llu out of range",
+					      (unsigned long long) pbn);
+
+	result = vdo_adjust_reference_count_for_rebuild(depot,
+							pbn,
+							VDO_JOURNAL_BLOCK_MAP_REMAPPING);
+	if (result != VDO_SUCCESS)
+		return uds_log_error_strerror(result,
+					      "Could not adjust reference count for block map tree PBN %llu",
+					      (unsigned long long) pbn);
+
+	repair->block_map_data_blocks++;
+	return VDO_SUCCESS;
+}
+
+static void rebuild_reference_counts(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	struct vdo *vdo = completion->vdo;
+	struct vdo_page_cache *cache = &vdo->block_map->zones[0].page_cache;
+
+	/* We must allocate ref_counts before we can rebuild them. */
+	if (abort_on_error(vdo_allocate_reference_counters(vdo->depot), repair))
+		return;
+
+	/*
+	 * Completion chaining from page cache hits can lead to stack overflow during the rebuild,
+	 * so clear out the cache before this rebuild phase.
+	 */
+	if (abort_on_error(vdo_invalidate_page_cache(cache), repair))
+		return;
+
+	prepare_repair_completion(repair, rebuild_from_leaves, VDO_ZONE_TYPE_LOGICAL);
+	vdo_traverse_forest(vdo->block_map, process_entry, completion);
+}
+
+/**
+ * increment_recovery_point() - Move the given recovery point forward by one entry.
+ * @point: The recovery point to alter.
+ */
+static void increment_recovery_point(struct recovery_point *point)
+{
+	if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
+		return;
+
+	point->entry_count = 0;
+	if (point->sector_count < (VDO_SECTORS_PER_BLOCK - 1)) {
+		point->sector_count++;
+		return;
+	}
+
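+	/*
+	 * Sector 0 of each journal block holds the block header, so entry sectors run from 1 to
+	 * VDO_SECTORS_PER_BLOCK - 1; wrap around to sector 1 of the next block.
+	 */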
+	point->sequence_number++;
+	point->sector_count = 1;
+}
+
+/**
+ * advance_points() - Advance the current recovery and journal points.
+ * @repair: The repair_completion whose points are to be advanced.
+ * @entries_per_block: The number of entries in a recovery journal block.
+ */
+static void
+advance_points(struct repair_completion *repair, journal_entry_count_t entries_per_block)
+{
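+	/*
+	 * Each recovery journal entry is applied in two steps: first the increment for its new
+	 * mapping, then the decrement for its unmapping. Only advance to the next entry once
+	 * both steps have been applied.
+	 */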
+	if (!repair->next_recovery_point.increment_applied) {
+		repair->next_recovery_point.increment_applied = true;
+		return;
+	}
+
+	increment_recovery_point(&repair->next_recovery_point);
+	vdo_advance_journal_point(&repair->next_journal_point, entries_per_block);
+	repair->next_recovery_point.increment_applied = false;
+}
+
+/**
+ * before_recovery_point() - Check whether the first point precedes the second point.
+ * @first: The first recovery point.
+ * @second: The second recovery point.
+ *
+ * Return: true if the first point precedes the second point.
+ */
+static bool __must_check
+before_recovery_point(const struct recovery_point *first, const struct recovery_point *second)
+{
+	if (first->sequence_number < second->sequence_number)
+		return true;
+
+	if (first->sequence_number > second->sequence_number)
+		return false;
+
+	if (first->sector_count < second->sector_count)
+		return true;
+
+	return ((first->sector_count == second->sector_count) &&
+		(first->entry_count < second->entry_count));
+}
+
+static struct packed_journal_sector * __must_check
+get_sector(struct recovery_journal *journal,
+	   char *journal_data,
+	   sequence_number_t sequence,
+	   u8 sector_number)
+{
+	off_t offset;
+
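+	/* Index into the in-memory copy of the journal: whole-block offset plus sector offset. */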
+	offset = ((vdo_get_recovery_journal_block_number(journal, sequence) * VDO_BLOCK_SIZE) +
+		  (VDO_SECTOR_SIZE * sector_number));
+	return (struct packed_journal_sector *) (journal_data + offset);
+}
+
+/**
+ * get_entry() - Unpack the recovery journal entry associated with the given recovery point.
+ * @repair: The repair completion.
+ * @point: The recovery point.
+ *
+ * Return: The unpacked contents of the matching recovery journal entry.
+ */
+static struct recovery_journal_entry
+get_entry(const struct repair_completion *repair, const struct recovery_point *point)
+{
+	struct packed_journal_sector *sector;
+
+	sector = get_sector(repair->completion.vdo->recovery_journal,
+			    repair->journal_data,
+			    point->sequence_number,
+			    point->sector_count);
+	return vdo_unpack_recovery_journal_entry(&sector->entries[point->entry_count]);
+}
+
+/**
+ * validate_recovery_journal_entry() - Validate a recovery journal entry.
+ * @vdo: The vdo.
+ * @entry: The entry to validate.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int
+validate_recovery_journal_entry(const struct vdo *vdo, const struct recovery_journal_entry *entry)
+{
+	if ((entry->slot.pbn >= vdo->states.vdo.config.physical_blocks) ||
+	    (entry->slot.slot >= VDO_BLOCK_MAP_ENTRIES_PER_PAGE) ||
+	    !vdo_is_valid_location(&entry->mapping) ||
+	    !vdo_is_valid_location(&entry->unmapping) ||
+	    !vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn) ||
+	    !vdo_is_physical_data_block(vdo->depot, entry->unmapping.pbn))
+		return uds_log_error_strerror(VDO_CORRUPT_JOURNAL,
+					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not within bounds",
+					      vdo_get_journal_operation_name(entry->operation),
+					      (unsigned long long) entry->slot.pbn,
+					      entry->slot.slot,
+					      (unsigned long long) entry->unmapping.pbn,
+					      (unsigned long long) entry->mapping.pbn);
+
+	if ((entry->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) &&
+	    (vdo_is_state_compressed(entry->mapping.state) ||
+	     (entry->mapping.pbn == VDO_ZERO_BLOCK) ||
+	     (entry->unmapping.state != VDO_MAPPING_STATE_UNMAPPED) ||
+	     (entry->unmapping.pbn != VDO_ZERO_BLOCK)))
+		return uds_log_error_strerror(VDO_CORRUPT_JOURNAL,
+					      "Invalid entry: %s (%llu, %u) from %llu to %llu is not a valid tree mapping",
+					      vdo_get_journal_operation_name(entry->operation),
+					      (unsigned long long) entry->slot.pbn,
+					      entry->slot.slot,
+					      (unsigned long long) entry->unmapping.pbn,
+					      (unsigned long long) entry->mapping.pbn);
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * add_slab_journal_entries() - Replay recovery journal entries into the slab journals of the
+ *                              allocator currently being recovered.
+ * @completion: The allocator completion.
+ *
+ * Waits for slab journal tailblock space when necessary. This method is its own callback.
+ */
+static void add_slab_journal_entries(struct vdo_completion *completion)
+{
+	struct recovery_point *recovery_point;
+	struct repair_completion *repair = completion->parent;
+	struct vdo *vdo = completion->vdo;
+	struct recovery_journal *journal = vdo->recovery_journal;
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+
+	/* Get ready in case we need to enqueue again. */
+	vdo_prepare_completion(completion,
+			       add_slab_journal_entries,
+			       vdo_notify_slab_journals_are_recovered,
+			       completion->callback_thread_id,
+			       repair);
+	for (recovery_point = &repair->next_recovery_point;
+	     before_recovery_point(recovery_point, &repair->tail_recovery_point);
+	     advance_points(repair, journal->entries_per_block)) {
+		int result;
+		physical_block_number_t pbn;
+		struct vdo_slab *slab;
+		struct recovery_journal_entry entry = get_entry(repair, recovery_point);
+		bool increment = !repair->next_recovery_point.increment_applied;
+
+		if (increment) {
+			result = validate_recovery_journal_entry(vdo, &entry);
+			if (result != VDO_SUCCESS) {
+				vdo_enter_read_only_mode(vdo, result);
+				vdo_fail_completion(completion, result);
+				return;
+			}
+
+			pbn = entry.mapping.pbn;
+		} else {
+			pbn = entry.unmapping.pbn;
+		}
+
+		if (pbn == VDO_ZERO_BLOCK)
+			continue;
+
+		slab = vdo_get_slab(vdo->depot, pbn);
+		if (slab->allocator != allocator)
+			continue;
+
+		if (!vdo_attempt_replay_into_slab(slab,
+						  pbn,
+						  entry.operation,
+						  increment,
+						  &repair->next_journal_point,
+						  completion))
+			return;
+
+		repair->entries_added_to_slab_journals++;
+	}
+
+	vdo_notify_slab_journals_are_recovered(completion);
+}
+
+/**
+ * vdo_replay_into_slab_journals() - Replay recovery journal entries in the slab journals of slabs
+ *                                   owned by a given block_allocator.
+ * @allocator: The allocator whose slab journals are to be recovered.
+ * @context: The slab depot load context supplied by a recovery when it loads the depot.
+ */
+void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context)
+{
+	struct vdo_completion *completion = &allocator->completion;
+	struct repair_completion *repair = context;
+	struct vdo *vdo = completion->vdo;
+
+	vdo_assert_on_physical_zone_thread(vdo, allocator->zone_number, __func__);
+	if (repair->entry_count == 0) {
+		/* there's nothing to replay */
+		repair->logical_blocks_used = vdo->recovery_journal->logical_blocks_used;
+		repair->block_map_data_blocks = vdo->recovery_journal->block_map_data_blocks;
+		vdo_notify_slab_journals_are_recovered(completion);
+		return;
+	}
+
+	repair->next_recovery_point = (struct recovery_point) {
+		.sequence_number = repair->slab_journal_head,
+		.sector_count = 1,
+		.entry_count = 0,
+	};
+
+	repair->next_journal_point = (struct journal_point) {
+		.sequence_number = repair->slab_journal_head,
+		.entry_count = 0,
+	};
+
+	uds_log_info("Replaying entries into slab journals for zone %u", allocator->zone_number);
+	completion->parent = repair;
+	add_slab_journal_entries(completion);
+}
+
+static void load_slab_depot(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	const struct admin_state_code *operation;
+
+	vdo_assert_on_admin_thread(completion->vdo, __func__);
+
+	if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state)) {
+		prepare_repair_completion(repair, rebuild_reference_counts, VDO_ZONE_TYPE_LOGICAL);
+		operation = VDO_ADMIN_STATE_LOADING_FOR_REBUILD;
+	} else {
+		prepare_repair_completion(repair, drain_slab_depot, VDO_ZONE_TYPE_ADMIN);
+		operation = VDO_ADMIN_STATE_LOADING_FOR_RECOVERY;
+	}
+
+	vdo_load_slab_depot(completion->vdo->depot, operation, completion, repair);
+}
+
+static void flush_block_map(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	const struct admin_state_code *operation;
+
+	vdo_assert_on_admin_thread(completion->vdo, __func__);
+
+	uds_log_info("Flushing block map changes");
+	prepare_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
+	operation = (vdo_state_requires_read_only_rebuild(completion->vdo->load_state) ?
+		     VDO_ADMIN_STATE_REBUILDING :
+		     VDO_ADMIN_STATE_RECOVERING);
+	vdo_drain_block_map(completion->vdo->block_map, operation, completion);
+}
+
+static bool finish_if_done(struct repair_completion *repair)
+{
+	/* Pages are still being launched or there is still work to do */
+	if (repair->launching || (repair->outstanding > 0))
+		return false;
+
+	if (repair->completion.result != VDO_SUCCESS) {
+		page_count_t i;
+
+		for (i = 0; i < repair->page_count; i++) {
+			struct vdo_page_completion *page_completion =
+				&repair->page_completions[i];
+
+			if (page_completion->ready)
+				vdo_release_page_completion(&page_completion->completion);
+		}
+
+		vdo_launch_completion(&repair->completion);
+		return true;
+	}
+
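+	/*
+	 * The block map phase is only done once current_entry has been decremented past the
+	 * start of the entries array.
+	 */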
+	if (repair->current_entry >= repair->entries)
+		return false;
+
+	launch_repair_completion(repair, flush_block_map, VDO_ZONE_TYPE_ADMIN);
+	return true;
+}
+
+static void abort_block_map_recovery(struct repair_completion *repair, int result)
+{
+	vdo_set_completion_result(&repair->completion, result);
+	finish_if_done(repair);
+}
+
+/**
+ * find_entry_starting_next_page() - Find the first journal entry after a given entry which is not
+ *                                   on the same block map page.
+ * @repair: The repair completion.
+ * @current_entry: The entry to search from.
+ * @needs_sort: Whether sorting is needed to proceed.
+ *
+ * Return: Pointer to the first later journal entry on a different block map page, or a pointer to
+ *         just before the journal entries if no subsequent entry is on a different block map page.
+ */
+static struct numbered_block_mapping *
+find_entry_starting_next_page(struct repair_completion *repair,
+			      struct numbered_block_mapping *current_entry,
+			      bool needs_sort)
+{
+	size_t current_page;
+
+	/* If current_entry is invalid, return immediately. */
+	if (current_entry < repair->entries)
+		return current_entry;
+
+	current_page = current_entry->block_map_slot.pbn;
+
+	/* Decrement current_entry until it's out of bounds or on a different page. */
+	while ((current_entry >= repair->entries) &&
+	       (current_entry->block_map_slot.pbn == current_page)) {
+		if (needs_sort) {
+			struct numbered_block_mapping *just_sorted_entry =
+				sort_next_heap_element(repair);
+			ASSERT_LOG_ONLY(just_sorted_entry < current_entry,
+					"heap is returning elements in an unexpected order");
+		}
+
+		current_entry--;
+	}
+
+	return current_entry;
+}
+
+/*
+ * Apply a range of journal entries [starting_entry, ending_entry) to a block map page.
+ */
+static void apply_journal_entries_to_page(struct block_map_page *page,
+					  struct numbered_block_mapping *starting_entry,
+					  struct numbered_block_mapping *ending_entry)
+{
+	struct numbered_block_mapping *current_entry = starting_entry;
+
+	while (current_entry != ending_entry) {
+		page->entries[current_entry->block_map_slot.slot] = current_entry->block_map_entry;
+		current_entry--;
+	}
+}
+
+static void recover_ready_pages(struct repair_completion *repair,
+				struct vdo_completion *completion);
+
+static void block_map_page_loaded(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion->parent);
+
+	repair->outstanding--;
+	if (!repair->launching)
+		recover_ready_pages(repair, completion);
+}
+
+static void handle_block_map_page_load_error(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion->parent);
+
+	repair->outstanding--;
+	abort_block_map_recovery(repair, completion->result);
+}
+
+static void fetch_block_map_page(struct repair_completion *repair,
+				 struct vdo_completion *completion)
+{
+	physical_block_number_t pbn;
+
+	if (repair->current_unfetched_entry < repair->entries)
+		/* Nothing left to fetch. */
+		return;
+
+	/* Fetch the next page we haven't yet requested. */
+	pbn = repair->current_unfetched_entry->block_map_slot.pbn;
+	repair->current_unfetched_entry =
+		find_entry_starting_next_page(repair, repair->current_unfetched_entry, true);
+	repair->outstanding++;
+	vdo_get_page(((struct vdo_page_completion *) completion),
+		     &repair->completion.vdo->block_map->zones[0],
+		     pbn,
+		     true,
+		     &repair->completion,
+		     block_map_page_loaded,
+		     handle_block_map_page_load_error,
+		     false);
+}
+
+static struct vdo_page_completion *
+get_next_page_completion(struct repair_completion *repair, struct vdo_page_completion *completion)
+{
+	completion++;
+	if (completion == (&repair->page_completions[repair->page_count]))
+		completion = &repair->page_completions[0];
+	return completion;
+}
+
+static void recover_ready_pages(struct repair_completion *repair,
+				struct vdo_completion *completion)
+{
+	struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
+
+	if (finish_if_done(repair))
+		return;
+
+	if (repair->pbn != page_completion->pbn)
+		return;
+
+	while (page_completion->ready) {
+		struct numbered_block_mapping *start_of_next_page;
+		struct block_map_page *page;
+		int result;
+
+		result = vdo_get_cached_page(completion, &page);
+		if (result != VDO_SUCCESS) {
+			abort_block_map_recovery(repair, result);
+			return;
+		}
+
+		start_of_next_page =
+			find_entry_starting_next_page(repair, repair->current_entry, false);
+		apply_journal_entries_to_page(page, repair->current_entry, start_of_next_page);
+		repair->current_entry = start_of_next_page;
+		vdo_request_page_write(completion);
+		vdo_release_page_completion(completion);
+
+		if (finish_if_done(repair))
+			return;
+
+		repair->pbn = repair->current_entry->block_map_slot.pbn;
+		fetch_block_map_page(repair, completion);
+		page_completion = get_next_page_completion(repair, page_completion);
+		completion = &page_completion->completion;
+	}
+}
+
+static void recover_block_map(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = as_repair_completion(completion);
+	struct vdo *vdo = completion->vdo;
+	struct numbered_block_mapping *first_sorted_entry;
+	page_count_t i;
+
+	vdo_assert_on_logical_zone_thread(vdo, 0, __func__);
+
+	/* Suppress block map errors. */
+	vdo->block_map->zones[0].page_cache.rebuilding =
+		vdo_state_requires_read_only_rebuild(vdo->load_state);
+
+	if (repair->block_map_entry_count == 0) {
+		uds_log_info("Replaying 0 recovery entries into block map");
+		UDS_FREE(UDS_FORGET(repair->journal_data));
+		launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
+		return;
+	}
+
+	/*
+	 * Organize the journal entries into a binary heap so we can iterate over them in sorted
+	 * order incrementally, avoiding an expensive sort call.
+	 */
+	repair->replay_heap = (struct min_heap) {
+		.data = repair->entries,
+		.nr = repair->block_map_entry_count,
+		.size = repair->block_map_entry_count,
+	};
+	min_heapify_all(&repair->replay_heap, &repair_min_heap);
+
+	uds_log_info("Replaying %zu recovery entries into block map",
+		     repair->block_map_entry_count);
+
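+	/*
+	 * The heap pops its smallest element into the last unsorted slot of the entries array,
+	 * so the sorted entries accumulate backwards from the end of the array. Replay therefore
+	 * starts at the last entry and walks toward the beginning.
+	 */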
+	repair->current_entry = &repair->entries[repair->block_map_entry_count - 1];
+	first_sorted_entry = sort_next_heap_element(repair);
+	ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry,
+			"heap is returning elements in an unexpected order");
+
+	/* Prevent any page from being processed until all pages have been launched. */
+	repair->launching = true;
+	repair->pbn = repair->current_entry->block_map_slot.pbn;
+	repair->current_unfetched_entry = repair->current_entry;
+	for (i = 0; i < repair->page_count; i++) {
+		if (repair->current_unfetched_entry < repair->entries)
+			break;
+
+		fetch_block_map_page(repair, &repair->page_completions[i].completion);
+	}
+	repair->launching = false;
+
+	/* Process any ready pages. */
+	recover_ready_pages(repair, &repair->page_completions[0].completion);
+}
+
+/**
+ * get_recovery_journal_block_header() - Get the block header for a block at a position in the
+ *                                       journal data and unpack it.
+ * @journal: The recovery journal.
+ * @data: The recovery journal data.
+ * @sequence: The sequence number.
+ *
+ * Return: The unpacked header.
+ */
+static struct recovery_block_header __must_check
+get_recovery_journal_block_header(struct recovery_journal *journal,
+				  char *data,
+				  sequence_number_t sequence)
+{
+	physical_block_number_t pbn = vdo_get_recovery_journal_block_number(journal, sequence);
+	char *header = &data[pbn * VDO_BLOCK_SIZE];
+
+	return vdo_unpack_recovery_block_header((struct packed_journal_header *) header);
+}
+
+/**
+ * is_valid_recovery_journal_block() - Determine whether the given header describes a valid block
+ *                                     for the given journal.
+ * @journal: The journal to use.
+ * @header: The unpacked block header to check.
+ * @old_ok: Whether an old format header is valid.
+ *
+ * A block is not valid if it is unformatted, or if it is older than the last successful recovery
+ * or reformat.
+ *
+ * Return: True if the header is valid.
+ */
+static bool __must_check
+is_valid_recovery_journal_block(const struct recovery_journal *journal,
+				const struct recovery_block_header *header,
+				bool old_ok)
+{
+	if ((header->nonce != journal->nonce) ||
+	    (header->recovery_count != journal->recovery_count))
+		return false;
+
+	if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2)
+		return (header->entry_count <= journal->entries_per_block);
+
+	return (old_ok &&
+		(header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL) &&
+		(header->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK));
+}
+
+/**
+ * is_exact_recovery_journal_block() - Determine whether the given header describes the exact block
+ *                                     indicated.
+ * @journal: The journal to use.
+ * @header: The unpacked block header to check.
+ * @sequence: The expected sequence number.
+ * @type: The expected metadata type.
+ *
+ * Return: True if the block matches.
+ */
+static bool __must_check
+is_exact_recovery_journal_block(const struct recovery_journal *journal,
+				const struct recovery_block_header *header,
+				sequence_number_t sequence,
+				enum vdo_metadata_type type)
+{
+	return ((header->metadata_type == type) &&
+		(header->sequence_number == sequence) &&
+		(is_valid_recovery_journal_block(journal, header, true)));
+}
+
+/**
+ * find_recovery_journal_head_and_tail() - Find the tail and head of the journal.
+ * @repair: The repair completion.
+ *
+ * Return: True if there were valid journal blocks.
+ */
+static bool find_recovery_journal_head_and_tail(struct repair_completion *repair)
+{
+	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
+	bool found_entries = false;
+	physical_block_number_t i;
+
+	/*
+	 * Ensure that we don't replay old entries since we know the tail recorded in the super
+	 * block must be a lower bound. Not doing so can result in extra data loss by setting the
+	 * tail too early.
+	 */
+	repair->highest_tail = journal->tail;
+	for (i = 0; i < journal->size; i++) {
+		struct recovery_block_header header =
+			get_recovery_journal_block_header(journal, repair->journal_data, i);
+
+		if (!is_valid_recovery_journal_block(journal, &header, true))
+			/* This block is old or incorrectly formatted */
+			continue;
+
+		if (vdo_get_recovery_journal_block_number(journal, header.sequence_number) != i)
+			/* This block is in the wrong location */
+			continue;
+
+		if (header.sequence_number >= repair->highest_tail) {
+			found_entries = true;
+			repair->highest_tail = header.sequence_number;
+		}
+
+		if (!found_entries)
+			continue;
+
+		if (header.block_map_head > repair->block_map_head)
+			repair->block_map_head = header.block_map_head;
+
+		if (header.slab_journal_head > repair->slab_journal_head)
+			repair->slab_journal_head = header.slab_journal_head;
+	}
+
+	return found_entries;
+}
+
+/**
+ * unpack_entry() - Unpack a recovery journal entry in either format.
+ * @vdo: The vdo.
+ * @packed: The entry to unpack.
+ * @format: The expected format of the entry.
+ * @entry: The structure to hold the unpacked entry.
+ *
+ * Return: true if the entry should be applied.
+ */
+static bool unpack_entry(struct vdo *vdo,
+			 char *packed,
+			 enum vdo_metadata_type format,
+			 struct recovery_journal_entry *entry)
+{
+	if (format == VDO_METADATA_RECOVERY_JOURNAL_2) {
+		struct packed_recovery_journal_entry *packed_entry =
+			(struct packed_recovery_journal_entry *) packed;
+
+		*entry = vdo_unpack_recovery_journal_entry(packed_entry);
+	} else {
+		physical_block_number_t low32, high4;
+
+		struct packed_recovery_journal_entry_1 *packed_entry =
+			(struct packed_recovery_journal_entry_1 *) packed;
+
+		if (packed_entry->operation == VDO_JOURNAL_DATA_INCREMENT)
+			entry->operation = VDO_JOURNAL_DATA_REMAPPING;
+		else if (packed_entry->operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT)
+			entry->operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING;
+		else
+			return false;
+
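+		/* Reassemble the slot PBN from its packed low 32 bits and high nibble. */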
+		low32 = __le32_to_cpu(packed_entry->pbn_low_word);
+		high4 = packed_entry->pbn_high_nibble;
+		entry->slot = (struct block_map_slot) {
+			.pbn = ((high4 << 32) | low32),
+			.slot = (packed_entry->slot_low | (packed_entry->slot_high << 6)),
+		};
+		entry->mapping = vdo_unpack_block_map_entry(&packed_entry->block_map_entry);
+		entry->unmapping = (struct data_location) {
+			.pbn = VDO_ZERO_BLOCK,
+			.state = VDO_MAPPING_STATE_UNMAPPED,
+		};
+	}
+
+	return (validate_recovery_journal_entry(vdo, entry) == VDO_SUCCESS);
+}
+
+/**
+ * append_sector_entries() - Append an array of recovery journal entries from a journal block
+ *                           sector to the array of numbered mappings in the repair completion,
+ *                           numbering each entry in the order they are appended.
+ * @repair: The repair completion.
+ * @entries: The entries in the sector.
+ * @format: The format of the sector.
+ * @entry_count: The number of entries to append.
+ */
+static void append_sector_entries(struct repair_completion *repair,
+				  char *entries,
+				  enum vdo_metadata_type format,
+				  journal_entry_count_t entry_count)
+{
+	journal_entry_count_t i;
+	struct vdo *vdo = repair->completion.vdo;
+	off_t increment = ((format == VDO_METADATA_RECOVERY_JOURNAL_2)
+			   ? sizeof(struct packed_recovery_journal_entry)
+			   : sizeof(struct packed_recovery_journal_entry_1));
+
+	for (i = 0; i < entry_count; i++, entries += increment) {
+		struct recovery_journal_entry entry;
+
+		if (!unpack_entry(vdo, entries, format, &entry))
+			/* When recovering from read-only mode, ignore damaged entries. */
+			continue;
+
+		repair->entries[repair->block_map_entry_count] =
+			(struct numbered_block_mapping) {
+			.block_map_slot = entry.slot,
+			.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
+								    entry.mapping.state),
+			.number = repair->block_map_entry_count,
+		};
+		repair->block_map_entry_count++;
+	}
+}
+
+static journal_entry_count_t entries_per_sector(enum vdo_metadata_type format, u8 sector_number)
+{
+	if (format == VDO_METADATA_RECOVERY_JOURNAL_2)
+		return RECOVERY_JOURNAL_ENTRIES_PER_SECTOR;
+
+	return ((sector_number == (VDO_SECTORS_PER_BLOCK - 1))
+		? RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR
+		: RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR);
+}
+
+static void extract_entries_from_block(struct repair_completion *repair,
+				       struct recovery_journal *journal,
+				       sequence_number_t sequence,
+				       enum vdo_metadata_type format,
+				       journal_entry_count_t entries)
+{
+	sector_count_t i;
+	struct recovery_block_header header =
+		get_recovery_journal_block_header(journal, repair->journal_data, sequence);
+
+	if (!is_exact_recovery_journal_block(journal, &header, sequence, format))
+		/* This block is invalid, so skip it. */
+		return;
+
+	entries = min(entries, header.entry_count);
+	for (i = 1; i < VDO_SECTORS_PER_BLOCK; i++) {
+		struct packed_journal_sector *sector =
+			get_sector(journal, repair->journal_data, sequence, i);
+		journal_entry_count_t sector_entries = min(entries, entries_per_sector(format, i));
+
+		if (vdo_is_valid_recovery_journal_sector(&header, sector, i)) {
+			/* Only extract as many as the block header calls for. */
+			append_sector_entries(repair,
+					      (char *) sector->entries,
+					      format,
+					      min_t(journal_entry_count_t,
+						    sector->entry_count,
+						    sector_entries));
+		}
+
+		/*
+		 * Even if the sector wasn't full, count it as full when counting up to the
+		 * entry count the block header claims.
+		 */
+		entries -= sector_entries;
+	}
+}
+
+static int parse_journal_for_rebuild(struct repair_completion *repair)
+{
+	int result;
+	sequence_number_t i;
+	block_count_t count;
+	enum vdo_metadata_type format;
+	struct vdo *vdo = repair->completion.vdo;
+	struct recovery_journal *journal = vdo->recovery_journal;
+	journal_entry_count_t entries_per_block = journal->entries_per_block;
+
+	format = get_recovery_journal_block_header(journal,
+						   repair->journal_data,
+						   repair->highest_tail).metadata_type;
+	if (format == VDO_METADATA_RECOVERY_JOURNAL)
+		entries_per_block = RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK;
+
+	/*
+	 * Allocate an array of numbered_block_mapping structures large enough to transcribe every
+	 * packed_recovery_journal_entry from every valid journal block.
+	 */
+	count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block);
+	result = UDS_ALLOCATE(count, struct numbered_block_mapping, __func__, &repair->entries);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (i = repair->block_map_head; i <= repair->highest_tail; i++)
+		extract_entries_from_block(repair, journal, i, format, entries_per_block);
+
+	return VDO_SUCCESS;
+}
+
+static int validate_heads(struct repair_completion *repair)
+{
+	/* Both heads must be at or before the tail. */
+	if ((repair->block_map_head <= repair->tail) &&
+	    (repair->slab_journal_head <= repair->tail))
+		return VDO_SUCCESS;
+
+	return uds_log_error_strerror(VDO_CORRUPT_JOURNAL,
+				      "Journal tail too early. block map head: %llu, slab journal head: %llu, tail: %llu",
+				      (unsigned long long) repair->block_map_head,
+				      (unsigned long long) repair->slab_journal_head,
+				      (unsigned long long) repair->tail);
+}
+
+/**
+ * extract_new_mappings() - Find all valid new mappings to be applied to the block map.
+ *
+ * The mappings are extracted from the journal and stored in a sortable array so that all of the
+ * mappings to be applied to a given block map page can be done in a single page fetch.
+ */
+static int extract_new_mappings(struct repair_completion *repair)
+{
+	int result;
+	struct vdo *vdo = repair->completion.vdo;
+	struct recovery_point recovery_point = {
+		.sequence_number = repair->block_map_head,
+		.sector_count = 1,
+		.entry_count = 0,
+	};
+
+	/*
+	 * Allocate an array of numbered_block_mapping structs just large enough to transcribe
+	 * every packed_recovery_journal_entry from every valid journal block.
+	 */
+	result = UDS_ALLOCATE(repair->entry_count,
+			      struct numbered_block_mapping,
+			      __func__,
+			      &repair->entries);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
+	     increment_recovery_point(&recovery_point)) {
+		struct recovery_journal_entry entry = get_entry(repair, &recovery_point);
+
+		result = validate_recovery_journal_entry(vdo, &entry);
+		if (result != VDO_SUCCESS) {
+			vdo_enter_read_only_mode(vdo, result);
+			return result;
+		}
+
+		repair->entries[repair->block_map_entry_count] =
+			(struct numbered_block_mapping) {
+			.block_map_slot = entry.slot,
+			.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
+								    entry.mapping.state),
+			.number = repair->block_map_entry_count,
+		};
+		repair->block_map_entry_count++;
+	}
+
+	result = ASSERT((repair->block_map_entry_count <= repair->entry_count),
+			"approximate entry count is an upper bound");
+	if (result != VDO_SUCCESS)
+		vdo_enter_read_only_mode(vdo, result);
+
+	return result;
+}
+
+/**
+ * compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of
+ *                    the journal.
+ */
+static noinline int compute_usages(struct repair_completion *repair)
+{
+	/*
+	 * VDO-5182: function is declared noinline to avoid what is likely a spurious valgrind
+	 * error about this structure being uninitialized.
+	 */
+	struct recovery_point recovery_point = {
+		.sequence_number = repair->tail,
+		.sector_count = 1,
+		.entry_count = 0,
+	};
+
+	struct vdo *vdo = repair->completion.vdo;
+	struct recovery_journal *journal = vdo->recovery_journal;
+	struct recovery_block_header header =
+		get_recovery_journal_block_header(journal, repair->journal_data, repair->tail);
+
+	repair->logical_blocks_used = header.logical_blocks_used;
+	repair->block_map_data_blocks = header.block_map_data_blocks;
+
+	for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
+	     increment_recovery_point(&recovery_point)) {
+		struct recovery_journal_entry entry = get_entry(repair, &recovery_point);
+		int result;
+
+		result = validate_recovery_journal_entry(vdo, &entry);
+		if (result != VDO_SUCCESS) {
+			vdo_enter_read_only_mode(vdo, result);
+			return result;
+		}
+
+		if (entry.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+			repair->block_map_data_blocks++;
+			continue;
+		}
+
+		if (vdo_is_mapped_location(&entry.mapping))
+			repair->logical_blocks_used++;
+
+		if (vdo_is_mapped_location(&entry.unmapping))
+			repair->logical_blocks_used--;
+	}
+
+	return VDO_SUCCESS;
+}
+
+static int parse_journal_for_recovery(struct repair_completion *repair)
+{
+	int result;
+	sequence_number_t i, head;
+	bool found_entries = false;
+	struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
+
+	head = min(repair->block_map_head, repair->slab_journal_head);
+	for (i = head; i <= repair->highest_tail; i++) {
+		struct recovery_block_header header;
+		journal_entry_count_t block_entries;
+		u8 j;
+
+		repair->tail = i;
+		repair->tail_recovery_point = (struct recovery_point) {
+			.sequence_number = i,
+			.sector_count = 0,
+			.entry_count = 0,
+		};
+
+		header = get_recovery_journal_block_header(journal, repair->journal_data, i);
+		if (header.metadata_type == VDO_METADATA_RECOVERY_JOURNAL) {
+			/* This is an old format block, so we need to upgrade */
+			uds_log_error_strerror(VDO_UNSUPPORTED_VERSION,
+					       "Recovery journal is in the old format, a read-only rebuild is required.");
+			vdo_enter_read_only_mode(repair->completion.vdo, VDO_UNSUPPORTED_VERSION);
+			return VDO_UNSUPPORTED_VERSION;
+		}
+
+		if (!is_exact_recovery_journal_block(journal,
+						     &header,
+						     i,
+						     VDO_METADATA_RECOVERY_JOURNAL_2))
+			/* A bad block header was found so this must be the end of the journal. */
+			break;
+
+		block_entries = header.entry_count;
+
+		/* Examine each sector in turn to determine the last valid sector. */
+		for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) {
+			struct packed_journal_sector *sector =
+				get_sector(journal, repair->journal_data, i, j);
+			journal_entry_count_t sector_entries =
+				min_t(journal_entry_count_t, sector->entry_count, block_entries);
+
+			/* A bad sector means that this block was torn. */
+			if (!vdo_is_valid_recovery_journal_sector(&header, sector, j))
+				break;
+
+			if (sector_entries > 0) {
+				found_entries = true;
+				repair->tail_recovery_point.sector_count++;
+				repair->tail_recovery_point.entry_count = sector_entries;
+				block_entries -= sector_entries;
+				repair->entry_count += sector_entries;
+			}
+
+			/* If this sector is short, the later sectors can't matter. */
+			if ((sector_entries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) ||
+			    (block_entries == 0))
+				break;
+		}
+
+		/* If this block was not filled, or if it tore, no later block can matter. */
+		if ((header.entry_count != journal->entries_per_block) || (block_entries > 0))
+			break;
+	}
+
+	if (!found_entries)
+		return validate_heads(repair);
+
+	/* Set the tail to the last valid tail block, if there is one. */
+	if (repair->tail_recovery_point.sector_count == 0)
+		repair->tail--;
+
+	result = validate_heads(repair);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	uds_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu",
+		     (unsigned long long) repair->highest_tail,
+		     (unsigned long long) repair->tail);
+
+	result = extract_new_mappings(repair);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return compute_usages(repair);
+}
+
+static int parse_journal(struct repair_completion *repair)
+{
+	if (!find_recovery_journal_head_and_tail(repair))
+		return VDO_SUCCESS;
+
+	return (vdo_state_requires_read_only_rebuild(repair->completion.vdo->load_state) ?
+		parse_journal_for_rebuild(repair) :
+		parse_journal_for_recovery(repair));
+}
+
+static void finish_journal_load(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = completion->parent;
+
+	if (++repair->vios_complete != repair->vio_count)
+		return;
+
+	uds_log_info("Finished reading recovery journal");
+	uninitialize_vios(repair);
+	prepare_repair_completion(repair, recover_block_map, VDO_ZONE_TYPE_LOGICAL);
+	vdo_continue_completion(&repair->completion, parse_journal(repair));
+}
+
+static void handle_journal_load_error(struct vdo_completion *completion)
+{
+	struct repair_completion *repair = completion->parent;
+
+	/* Preserve the error */
+	vdo_set_completion_result(&repair->completion, completion->result);
+	vio_record_metadata_io_error(as_vio(completion));
+	completion->callback(completion);
+}
+
+static void read_journal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct vdo *vdo = vio->completion.vdo;
+
+	continue_vio_after_io(vio, finish_journal_load, vdo->thread_config.admin_thread);
+}
+
+/**
+ * vdo_repair() - Load the recovery journal and then recover or rebuild a vdo.
+ * @parent: The completion to notify when the operation is complete.
+ */
+void vdo_repair(struct vdo_completion *parent)
+{
+	int result;
+	char *ptr;
+	struct repair_completion *repair;
+	struct vdo *vdo = parent->vdo;
+	struct recovery_journal *journal = vdo->recovery_journal;
+	physical_block_number_t pbn = journal->origin;
+	block_count_t remaining = journal->size;
+	block_count_t vio_count = DIV_ROUND_UP(remaining, MAX_BLOCKS_PER_VIO);
+	page_count_t page_count = min_t(page_count_t,
+					vdo->device_config->cache_size >> 1,
+					MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS);
+
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	if (vdo->load_state == VDO_FORCE_REBUILD) {
+		uds_log_warning("Rebuilding reference counts to clear read-only mode");
+		vdo->states.vdo.read_only_recoveries++;
+	} else if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) {
+		uds_log_warning("Rebuilding reference counts for upgrade");
+	} else {
+		uds_log_warning("Device was dirty, rebuilding reference counts");
+	}
+
+	result = UDS_ALLOCATE_EXTENDED(struct repair_completion,
+				       page_count,
+				       struct vdo_page_completion,
+				       __func__,
+				       &repair);
+	if (result != VDO_SUCCESS) {
+		vdo_fail_completion(parent, result);
+		return;
+	}
+
+	vdo_initialize_completion(&repair->completion, vdo, VDO_REPAIR_COMPLETION);
+	repair->completion.error_handler = abort_repair;
+	repair->completion.parent = parent;
+	prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
+	repair->page_count = page_count;
+
+	result = UDS_ALLOCATE(remaining * VDO_BLOCK_SIZE, char, __func__, &repair->journal_data);
+	if (abort_on_error(result, repair))
+		return;
+
+	result = UDS_ALLOCATE(vio_count, struct vio, __func__, &repair->vios);
+	if (abort_on_error(result, repair))
+		return;
+
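+	/* Carve the journal buffer into vios of at most MAX_BLOCKS_PER_VIO blocks each. */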
+	ptr = repair->journal_data;
+	for (repair->vio_count = 0; repair->vio_count < vio_count; repair->vio_count++) {
+		block_count_t blocks = min_t(block_count_t, remaining, MAX_BLOCKS_PER_VIO);
+
+		result = allocate_vio_components(vdo,
+						 VIO_TYPE_RECOVERY_JOURNAL,
+						 VIO_PRIORITY_METADATA,
+						 repair,
+						 blocks,
+						 ptr,
+						 &repair->vios[repair->vio_count]);
+		if (abort_on_error(result, repair))
+			return;
+
+		ptr += (blocks * VDO_BLOCK_SIZE);
+		remaining -= blocks;
+	}
+
+	for (vio_count = 0;
+	     vio_count < repair->vio_count;
+	     vio_count++, pbn += MAX_BLOCKS_PER_VIO)
+		submit_metadata_vio(&repair->vios[vio_count],
+				    pbn,
+				    read_journal_endio,
+				    handle_journal_load_error,
+				    REQ_OP_READ);
+}
diff --git a/drivers/md/dm-vdo/repair.h b/drivers/md/dm-vdo/repair.h
new file mode 100644
index 00000000000..a28637d0a05
--- /dev/null
+++ b/drivers/md/dm-vdo/repair.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_REPAIR_H
+#define VDO_REPAIR_H
+
+#include "types.h"
+
+void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context);
+void vdo_repair(struct vdo_completion *parent);
+
+#endif /* VDO_REPAIR_H */
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
new file mode 100644
index 00000000000..61bd375b234
--- /dev/null
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -0,0 +1,5212 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "slab-depot.h"
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+#include <linux/log2.h>
+#include <linux/min_heap.h>
+#include <linux/minmax.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "physical-zone.h"
+#include "priority-table.h"
+#include "recovery-journal.h"
+#include "repair.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+static const u64 BYTES_PER_WORD = sizeof(u64);
+static const bool NORMAL_OPERATION = true;
+
+struct slab_journal_eraser {
+	struct vdo_completion *parent;
+	struct dm_kcopyd_client *client;
+	block_count_t blocks;
+	struct slab_iterator slabs;
+};
+
+/**
+ * get_lock() - Get the lock object for a slab journal block by sequence number.
+ * @journal: vdo_slab journal to retrieve from.
+ * @sequence_number: Sequence number of the block.
+ *
+ * Return: The lock object for the given sequence number.
+ */
+static inline struct journal_lock * __must_check
+get_lock(struct slab_journal *journal, sequence_number_t sequence_number)
+{
+	return &journal->locks[sequence_number % journal->size];
+}
+
+static bool is_slab_open(struct vdo_slab *slab)
+{
+	return (!vdo_is_state_quiescing(&slab->state) && !vdo_is_state_quiescent(&slab->state));
+}
+
+/**
+ * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush.
+ * @journal: The journal to check.
+ *
+ * Return: true if the slab is not rebuilding and there are entry waiters to flush.
+ */
+static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal)
+{
+	return ((journal->slab->status != VDO_SLAB_REBUILDING) &&
+		vdo_has_waiters(&journal->entry_waiters));
+}
+
+/**
+ * is_reaping() - Check whether a reap is currently in progress.
+ * @journal: The journal which may be reaping.
+ *
+ * Return: true if the journal is reaping.
+ */
+static inline bool __must_check is_reaping(struct slab_journal *journal)
+{
+	return (journal->head != journal->unreapable);
+}
+
+/**
+ * initialize_tail_block() - Initialize tail block as a new block.
+ * @journal: The journal whose tail block is being initialized.
+ */
+static void initialize_tail_block(struct slab_journal *journal)
+{
+	struct slab_journal_block_header *header = &journal->tail_header;
+
+	header->sequence_number = journal->tail;
+	header->entry_count = 0;
+	header->has_block_map_increments = false;
+}
+
+/**
+ * initialize_journal_state() - Set all journal fields appropriately to start journaling.
+ * @journal: The journal to be reset, based on its tail sequence number.
+ */
+static void initialize_journal_state(struct slab_journal *journal)
+{
+	journal->unreapable = journal->head;
+	journal->reap_lock = get_lock(journal, journal->unreapable);
+	journal->next_commit = journal->tail;
+	journal->summarized = journal->last_summarized = journal->tail;
+	initialize_tail_block(journal);
+}
+
+/**
+ * block_is_full() - Check whether a journal block is full.
+ * @journal: The slab journal for the block.
+ *
+ * Return: true if the tail block is full.
+ */
+static bool __must_check block_is_full(struct slab_journal *journal)
+{
+	journal_entry_count_t count = journal->tail_header.entry_count;
+
+	return (journal->tail_header.has_block_map_increments ?
+		(journal->full_entries_per_block == count) :
+		(journal->entries_per_block == count));
+}
+
+static void add_entries(struct slab_journal *journal);
+static void update_tail_block_location(struct slab_journal *journal);
+static void release_journal_locks(struct waiter *waiter, void *context);
+
+/**
+ * is_slab_journal_blank() - Check whether a slab's journal is blank.
+ *
+ * A slab journal is blank if it has never had any entries recorded in it.
+ *
+ * Return: true if the slab's journal has never been modified.
+ */
+static bool is_slab_journal_blank(const struct vdo_slab *slab)
+{
+	return ((slab->journal.tail == 1) && (slab->journal.tail_header.entry_count == 0));
+}
+
+/**
+ * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
+ *                             order.
+ * @journal: The journal to be marked dirty.
+ * @lock: The recovery journal lock held by the slab journal.
+ */
+static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock)
+{
+	struct slab_journal *dirty_journal;
+	struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;
+
+	ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");
+
+	journal->recovery_lock = lock;
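+	/* Keep the dirty list sorted by recovery_lock by searching backwards from the tail. */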
+	list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) {
+		if (dirty_journal->recovery_lock <= journal->recovery_lock)
+			break;
+	}
+
+	list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next);
+}
+
+static void mark_slab_journal_clean(struct slab_journal *journal)
+{
+	journal->recovery_lock = 0;
+	list_del_init(&journal->dirty_entry);
+}
+
+static void check_if_slab_drained(struct vdo_slab *slab)
+{
+	bool read_only;
+	struct slab_journal *journal = &slab->journal;
+	const struct admin_state_code *code;
+
+	if (!vdo_is_state_draining(&slab->state) ||
+	    must_make_entries_to_flush(journal) ||
+	    is_reaping(journal) ||
+	    journal->waiting_to_commit ||
+	    !list_empty(&journal->uncommitted_blocks) ||
+	    journal->updating_slab_summary ||
+	    (slab->active_count > 0))
+		return;
+
+	/* When not suspending or recovering, the slab must be clean. */
+	code = vdo_get_admin_state_code(&slab->state);
+	read_only = vdo_is_read_only(slab->allocator->depot->vdo);
+	if (!read_only &&
+	    vdo_has_waiters(&slab->dirty_blocks) &&
+	    (code != VDO_ADMIN_STATE_SUSPENDING) &&
+	    (code != VDO_ADMIN_STATE_RECOVERING))
+		return;
+
+	vdo_finish_draining_with_result(&slab->state, (read_only ? VDO_READ_ONLY : VDO_SUCCESS));
+}
+
+/* FULLNESS HINT COMPUTATION */
+
+/**
+ * compute_fullness_hint() - Translate a slab's free block count into a 'fullness hint' that can be
+ *                           stored in a slab_summary_entry's 7 bits that are dedicated to its free
+ *                           count.
+ * @depot: The depot whose summary is being updated.
+ * @free_blocks: The number of free blocks.
+ *
+ * Note: the number of free blocks must be strictly less than 2^23 blocks, even though
+ * theoretically slabs could contain precisely 2^23 blocks; there is an assumption that at least
+ * one block is used by metadata. This assumption is necessary; otherwise, the fullness hint might
+ * overflow. The fullness hint formula is roughly (fullness >> 16) & 0x7f, but (2^23 >> 16) & 0x7f
+ * is 0, which would make it impossible to distinguish completely full from completely empty.
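+ * For example, with a hint_shift of 16, 500,000 free blocks yields a hint of 7, while any
+ * non-zero count small enough to shift to 0 is reported as 1, keeping it distinguishable
+ * from a completely full slab.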
+ *
+ * Return: A fullness hint, which can be stored in 7 bits.
+ */
+static u8 __must_check compute_fullness_hint(struct slab_depot *depot, block_count_t free_blocks)
+{
+	block_count_t hint;
+
+	ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23");
+
+	if (free_blocks == 0)
+		return 0;
+
+	hint = free_blocks >> depot->hint_shift;
+	return ((hint == 0) ? 1 : hint);
+}
+
+/**
+ * check_summary_drain_complete() - Check whether an allocator's summary has finished draining.
+ * @allocator: The allocator to check.
+ */
+static void check_summary_drain_complete(struct block_allocator *allocator)
+{
+	struct vdo *vdo = allocator->depot->vdo;
+
+	if (!vdo_is_state_draining(&allocator->summary_state) ||
+	    (allocator->summary_write_count > 0))
+		return;
+
+	vdo_finish_operation(&allocator->summary_state,
+			     (vdo_is_read_only(vdo) ? VDO_READ_ONLY : VDO_SUCCESS));
+}
+
+/**
+ * notify_summary_waiters() - Wake all the waiters in a given queue.
+ * @allocator: The block allocator summary which owns the queue.
+ * @queue: The queue to notify.
+ */
+static void notify_summary_waiters(struct block_allocator *allocator, struct wait_queue *queue)
+{
+	int result = (vdo_is_read_only(allocator->depot->vdo) ? VDO_READ_ONLY : VDO_SUCCESS);
+
+	vdo_notify_all_waiters(queue, NULL, &result);
+}
+
+static void launch_write(struct slab_summary_block *summary_block);
+
+/**
+ * finish_updating_slab_summary_block() - Finish processing a block which attempted to write,
+ *                                        whether or not the attempt succeeded.
+ * @block: The block.
+ */
+static void finish_updating_slab_summary_block(struct slab_summary_block *block)
+{
+	notify_summary_waiters(block->allocator, &block->current_update_waiters);
+	block->writing = false;
+	block->allocator->summary_write_count--;
+	if (vdo_has_waiters(&block->next_update_waiters))
+		launch_write(block);
+	else
+		check_summary_drain_complete(block->allocator);
+}
+
+/**
+ * finish_update() - This is the callback for a successful summary block write.
+ * @completion: The write vio.
+ */
+static void finish_update(struct vdo_completion *completion)
+{
+	struct slab_summary_block *block =
+		container_of(as_vio(completion), struct slab_summary_block, vio);
+
+	atomic64_inc(&block->allocator->depot->summary_statistics.blocks_written);
+	finish_updating_slab_summary_block(block);
+}
+
+/**
+ * handle_write_error() - Handle an error writing a slab summary block.
+ * @completion: The write VIO.
+ */
+static void handle_write_error(struct vdo_completion *completion)
+{
+	struct slab_summary_block *block =
+		container_of(as_vio(completion), struct slab_summary_block, vio);
+
+	vio_record_metadata_io_error(as_vio(completion));
+	vdo_enter_read_only_mode(completion->vdo, completion->result);
+	finish_updating_slab_summary_block(block);
+}
+
+static void write_slab_summary_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_summary_block *block = container_of(vio, struct slab_summary_block, vio);
+
+	continue_vio_after_io(vio, finish_update, block->allocator->thread_id);
+}
+
+/**
+ * launch_write() - Write a slab summary block unless it is currently out for writing.
+ * @block: The block that needs to be committed.
+ */
+static void launch_write(struct slab_summary_block *block)
+{
+	struct block_allocator *allocator = block->allocator;
+	struct slab_depot *depot = allocator->depot;
+	physical_block_number_t pbn;
+
+	if (block->writing)
+		return;
+
+	allocator->summary_write_count++;
+	vdo_transfer_all_waiters(&block->next_update_waiters, &block->current_update_waiters);
+	block->writing = true;
+
+	if (vdo_is_read_only(depot->vdo)) {
+		finish_updating_slab_summary_block(block);
+		return;
+	}
+
+	memcpy(block->outgoing_entries, block->entries, VDO_BLOCK_SIZE);
+
+	/*
+	 * Flush before writing to ensure that the slab journal tail blocks and reference updates
+	 * covered by this summary update are stable (VDO-2332).
+	 */
+	pbn = (depot->summary_origin +
+	       (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number) +
+	       block->index);
+	submit_metadata_vio(&block->vio,
+			    pbn,
+			    write_slab_summary_endio,
+			    handle_write_error,
+			    REQ_OP_WRITE | REQ_PREFLUSH);
+}
+
+/**
+ * update_slab_summary_entry() - Update the entry for a slab.
+ * @slab: The slab whose entry is to be updated
+ * @waiter: The waiter that is updating the summary.
+ * @tail_block_offset: The offset of the slab journal's tail block.
+ * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load.
+ * @is_clean: Whether the slab is clean.
+ * @free_blocks: The number of free blocks.
+ */
+static void
+update_slab_summary_entry(struct vdo_slab *slab,
+			  struct waiter *waiter,
+			  tail_block_offset_t tail_block_offset,
+			  bool load_ref_counts,
+			  bool is_clean,
+			  block_count_t free_blocks)
+{
+	u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK;
+	struct block_allocator *allocator = slab->allocator;
+	struct slab_summary_block *block = &allocator->summary_blocks[index];
+	int result;
+	struct slab_summary_entry *entry;
+
+	if (vdo_is_read_only(block->vio.completion.vdo)) {
+		result = VDO_READ_ONLY;
+		waiter->callback(waiter, &result);
+		return;
+	}
+
+	if (vdo_is_state_draining(&allocator->summary_state) ||
+	    vdo_is_state_quiescent(&allocator->summary_state)) {
+		result = VDO_INVALID_ADMIN_STATE;
+		waiter->callback(waiter, &result);
+		return;
+	}
+
+	entry = &allocator->summary_entries[slab->slab_number];
+	*entry = (struct slab_summary_entry) {
+		.tail_block_offset = tail_block_offset,
+		.load_ref_counts = (entry->load_ref_counts || load_ref_counts),
+		.is_dirty = !is_clean,
+		.fullness_hint = compute_fullness_hint(allocator->depot, free_blocks),
+	};
+	vdo_enqueue_waiter(&block->next_update_waiters, waiter);
+	launch_write(block);
+}
+
+/**
+ * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are
+ *                    complete.
+ * @journal: The journal to be reaped.
+ */
+static void finish_reaping(struct slab_journal *journal)
+{
+	journal->head = journal->unreapable;
+	add_entries(journal);
+	check_if_slab_drained(journal->slab);
+}
+
+static void reap_slab_journal(struct slab_journal *journal);
+
+/**
+ * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try
+ *                      reaping again in case we deferred reaping due to an outstanding vio.
+ * @completion: The flush vio.
+ */
+static void complete_reaping(struct vdo_completion *completion)
+{
+	struct slab_journal *journal = completion->parent;
+
+	return_vio_to_pool(journal->slab->allocator->vio_pool,
+			   vio_as_pooled_vio(as_vio(UDS_FORGET(completion))));
+	finish_reaping(journal);
+	reap_slab_journal(journal);
+}
+
+/**
+ * handle_flush_error() - Handle an error flushing the lower layer.
+ * @completion: The flush vio.
+ */
+static void handle_flush_error(struct vdo_completion *completion)
+{
+	vio_record_metadata_io_error(as_vio(completion));
+	vdo_enter_read_only_mode(completion->vdo, completion->result);
+	complete_reaping(completion);
+}
+
+static void flush_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio,
+			      complete_reaping,
+			      journal->slab->allocator->thread_id);
+}
+
+/**
+ * flush_for_reaping() - A waiter callback for getting a vio with which to flush the lower layer
+ *                       prior to reaping.
+ * @waiter: The journal as a flush waiter.
+ * @context: The newly acquired flush vio.
+ */
+static void flush_for_reaping(struct waiter *waiter, void *context)
+{
+	struct slab_journal *journal = container_of(waiter, struct slab_journal, flush_waiter);
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+
+	vio->completion.parent = journal;
+	submit_flush_vio(vio, flush_endio, handle_flush_error);
+}
+
+/**
+ * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks.
+ * @journal: The slab journal.
+ */
+static void reap_slab_journal(struct slab_journal *journal)
+{
+	bool reaped = false;
+
+	if (is_reaping(journal))
+		/* We already have a reap in progress so wait for it to finish. */
+		return;
+
+	if ((journal->slab->status != VDO_SLAB_REBUILT) ||
+	    !vdo_is_state_normal(&journal->slab->state) ||
+	    vdo_is_read_only(journal->slab->allocator->depot->vdo))
+		/*
+		 * We must not reap in the first two cases, and there's no point in read-only mode.
+		 */
+		return;
+
+	/*
+	 * Start reclaiming blocks only when the journal head has no references. Then stop when a
+	 * block is referenced or reap reaches the most recently written block, referenced by the
+	 * slab summary, which has the sequence number just before the tail.
+	 */
+	while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) {
+		reaped = true;
+		journal->unreapable++;
+		journal->reap_lock++;
+		if (journal->reap_lock == &journal->locks[journal->size])
+			journal->reap_lock = &journal->locks[0];
+	}
+
+	if (!reaped)
+		return;
+
+	/*
+	 * It is never safe to reap a slab journal block without first issuing a flush, regardless
+	 * of whether a user flush has been received or not. In the absence of the flush, the
+	 * reference block write which released the locks allowing the slab journal to reap may not
+	 * be persisted. Although slab summary writes will eventually issue flushes, multiple slab
+	 * journal block writes can be issued while previous slab summary updates have not yet been
+	 * made. Even though those slab journal block writes will be ignored if the slab summary
+	 * update is not persisted, they may still overwrite the to-be-reaped slab journal block
+	 * resulting in a loss of reference count updates (VDO-2912).
+	 */
+	journal->flush_waiter.callback = flush_for_reaping;
+	acquire_vio_from_pool(journal->slab->allocator->vio_pool, &journal->flush_waiter);
+}
+
+/**
+ * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block.
+ * @journal: The slab journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ * @adjustment: Amount to adjust the reference counter.
+ *
+ * Note that when the adjustment is negative, the slab journal will be reaped.
+ */
+static void
+adjust_slab_journal_block_reference(struct slab_journal *journal,
+				    sequence_number_t sequence_number,
+				    int adjustment)
+{
+	struct journal_lock *lock;
+
+	if (sequence_number == 0)
+		return;
+
+	if (journal->slab->status == VDO_SLAB_REPLAYING)
+		/* Locks should not be used during offline replay. */
+		return;
+
+	ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
+	lock = get_lock(journal, sequence_number);
+	if (adjustment < 0)
+		ASSERT_LOG_ONLY((-adjustment <= lock->count),
+				"adjustment %d of lock count %u for slab journal block %llu must not underflow",
+				adjustment,
+				lock->count,
+				(unsigned long long) sequence_number);
+
+	lock->count += adjustment;
+	if (lock->count == 0)
+		reap_slab_journal(journal);
+}
+
+/**
+ * release_journal_locks() - Callback invoked after a slab summary update completes.
+ * @waiter: The slab summary waiter that has just been notified.
+ * @context: The result code of the update.
+ *
+ * Registered in the constructor on behalf of update_tail_block_location().
+ *
+ * Implements waiter_callback.
+ */
+static void release_journal_locks(struct waiter *waiter, void *context)
+{
+	sequence_number_t first, i;
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, slab_summary_waiter);
+	int result = *((int *)context);
+
+	if (result != VDO_SUCCESS) {
+		if (result != VDO_READ_ONLY)
+			/*
+			 * Don't bother logging what might be lots of errors if we are already in
+			 * read-only mode.
+			 */
+			uds_log_error_strerror(result,
+					       "failed slab summary update %llu",
+					       (unsigned long long) journal->summarized);
+
+		journal->updating_slab_summary = false;
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		check_if_slab_drained(journal->slab);
+		return;
+	}
+
+	if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) {
+		journal->partial_write_in_progress = false;
+		add_entries(journal);
+	}
+
+	first = journal->last_summarized;
+	journal->last_summarized = journal->summarized;
+	for (i = journal->summarized - 1; i >= first; i--) {
+		/*
+		 * Release the lock the summarized block held on the recovery journal. (During
+		 * replay, recovery_start will always be 0.)
+		 */
+		if (journal->recovery_journal != NULL) {
+			zone_count_t zone_number = journal->slab->allocator->zone_number;
+
+			vdo_release_recovery_journal_block_reference(journal->recovery_journal,
+								     get_lock(journal, i)->recovery_start,
+								     VDO_ZONE_TYPE_PHYSICAL,
+								     zone_number);
+		}
+
+		/*
+		 * Release our own lock against reaping for blocks that are committed. (This
+		 * function will not change locks during replay.)
+		 */
+		adjust_slab_journal_block_reference(journal, i, -1);
+	}
+
+	journal->updating_slab_summary = false;
+
+	reap_slab_journal(journal);
+
+	/* Check if the slab summary needs to be updated again. */
+	update_tail_block_location(journal);
+}
+
+/**
+ * update_tail_block_location() - Update the tail block location in the slab summary, if necessary.
+ * @journal: The slab journal that is updating its tail block location.
+ */
+static void update_tail_block_location(struct slab_journal *journal)
+{
+	block_count_t free_block_count;
+	struct vdo_slab *slab = journal->slab;
+
+	if (journal->updating_slab_summary ||
+	    vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
+	    (journal->last_summarized >= journal->next_commit)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
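+	/*
+	 * An unrecovered slab's free block count is not yet known, so reuse the count
+	 * implied by its existing summary hint.
+	 */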
+	if (slab->status != VDO_SLAB_REBUILT) {
+		u8 hint = slab->allocator->summary_entries[slab->slab_number].fullness_hint;
+
+		free_block_count = ((block_count_t) hint) << slab->allocator->depot->hint_shift;
+	} else {
+		free_block_count = slab->free_blocks;
+	}
+
+	journal->summarized = journal->next_commit;
+	journal->updating_slab_summary = true;
+
+	/*
+	 * Update the slab summary entry, marking the slab as dirty.
+	 * The slab journal can only reap past sequence number 1 when all the ref counts for this
+	 * slab have been written to the layer. Therefore, indicate that the ref counts must be
+	 * loaded when the journal head has reaped past sequence number 1.
+	 */
+	update_slab_summary_entry(slab,
+				  &journal->slab_summary_waiter,
+				  journal->summarized % journal->size,
+				  (journal->head > 1),
+				  false,
+				  free_block_count);
+}
+
+/**
+ * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries.
+ */
+static void reopen_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	sequence_number_t block;
+
+	ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
+			"vdo_slab journal's active block empty before reopening");
+	journal->head = journal->tail;
+	initialize_journal_state(journal);
+
+	/* Ensure no locks are spuriously held on an empty journal. */
+	for (block = 1; block <= journal->size; block++)
+		ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
+				"Scrubbed journal's block %llu is not locked",
+				(unsigned long long) block);
+
+	add_entries(journal);
+}
+
+static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio)
+{
+	const struct packed_slab_journal_block *block =
+		(const struct packed_slab_journal_block *) vio->vio.data;
+
+	return __le64_to_cpu(block->header.sequence_number);
+}
+
+/**
+ * complete_write() - Handle post-commit processing.
+ * @completion: The write vio as a completion.
+ *
+ * This is the callback registered by write_slab_journal_block().
+ */
+static void complete_write(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion));
+	struct slab_journal *journal = completion->parent;
+	sequence_number_t committed = get_committing_sequence_number(pooled);
+
+	list_del_init(&pooled->list_entry);
+	return_vio_to_pool(journal->slab->allocator->vio_pool, UDS_FORGET(pooled));
+
+	if (result != VDO_SUCCESS) {
+		vio_record_metadata_io_error(as_vio(completion));
+		uds_log_error_strerror(result,
+				       "cannot write slab journal block %llu",
+				       (unsigned long long) committed);
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		check_if_slab_drained(journal->slab);
+		return;
+	}
+
+	WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1);
+
+	if (list_empty(&journal->uncommitted_blocks)) {
+		/* If no blocks are outstanding, then the commit point is at the tail. */
+		journal->next_commit = journal->tail;
+	} else {
+		/* The commit point is always the beginning of the oldest incomplete block. */
+		pooled = container_of(journal->uncommitted_blocks.next,
+				      struct pooled_vio,
+				      list_entry);
+		journal->next_commit = get_committing_sequence_number(pooled);
+	}
+
+	update_tail_block_location(journal);
+}
+
+static void write_slab_journal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio, complete_write, journal->slab->allocator->thread_id);
+}
+
+/**
+ * write_slab_journal_block() - Write a slab journal block.
+ * @waiter: The vio pool waiter which was just notified.
+ * @context: The vio pool entry for the write.
+ *
+ * Callback from acquire_vio_from_pool() registered in commit_tail().
+ */
+static void write_slab_journal_block(struct waiter *waiter, void *context)
+{
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	struct slab_journal *journal = container_of(waiter, struct slab_journal, resource_waiter);
+	struct slab_journal_block_header *header = &journal->tail_header;
+	int unused_entries = journal->entries_per_block - header->entry_count;
+	physical_block_number_t block_number;
+	const struct admin_state_code *operation;
+
+	header->head = journal->head;
+	list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks);
+	vdo_pack_slab_journal_block_header(header, &journal->block->header);
+
+	/* Copy the tail block into the vio. */
+	memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE);
+
+	ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull");
+	if (unused_entries > 0) {
+		/*
+		 * Release the per-entry locks for any unused entries in the block we are about to
+		 * write.
+		 */
+		adjust_slab_journal_block_reference(journal,
+						    header->sequence_number,
+						    -unused_entries);
+		journal->partial_write_in_progress = !block_is_full(journal);
+	}
+
+	block_number = journal->slab->journal_origin + (header->sequence_number % journal->size);
+	vio->completion.parent = journal;
+
+	/*
+	 * This block won't be read in recovery until the slab summary is updated to refer to it.
+	 * The slab summary update does a flush which is sufficient to protect us from VDO-2331.
+	 */
+	submit_metadata_vio(UDS_FORGET(vio),
+			    block_number,
+			    write_slab_journal_endio,
+			    complete_write,
+			    REQ_OP_WRITE);
+
+	/* Since the write is submitted, the tail block structure can be reused. */
+	journal->tail++;
+	initialize_tail_block(journal);
+	journal->waiting_to_commit = false;
+
+	operation = vdo_get_admin_state_code(&journal->slab->state);
+	if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) {
+		vdo_finish_operation(&journal->slab->state,
+				     (vdo_is_read_only(journal->slab->allocator->depot->vdo) ?
+				      VDO_READ_ONLY :
+				      VDO_SUCCESS));
+		return;
+	}
+
+	add_entries(journal);
+}
+
+/**
+ * commit_tail() - Commit the tail block of the slab journal.
+ * @journal: The journal whose tail block should be committed.
+ */
+static void commit_tail(struct slab_journal *journal)
+{
+	if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal))
+		/*
+		 * There are no entries at the moment, but there are some waiters, so defer
+		 * initiating the flush until those entries are ready to write.
+		 */
+		return;
+
+	if (vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
+	    journal->waiting_to_commit ||
+	    (journal->tail_header.entry_count == 0))
+		/*
+		 * There is nothing to do since the tail block is empty, or writing, or the journal
+		 * is in read-only mode.
+		 */
+		return;
+
+	/*
+	 * Since we are about to commit the tail block, this journal no longer needs to be on the
+	 * ring of journals which the recovery journal might ask to commit.
+	 */
+	mark_slab_journal_clean(journal);
+
+	journal->waiting_to_commit = true;
+
+	journal->resource_waiter.callback = write_slab_journal_block;
+	acquire_vio_from_pool(journal->slab->allocator->vio_pool, &journal->resource_waiter);
+}
+
+/**
+ * encode_slab_journal_entry() - Encode a slab journal entry.
+ * @tail_header: The unpacked header for the block.
+ * @payload: The journal block payload to hold the entry.
+ * @sbn: The slab block number of the entry to encode.
+ * @operation: The type of the entry.
+ * @increment: True if this is an increment.
+ *
+ * Exposed for unit tests.
+ */
+static void
+encode_slab_journal_entry(struct slab_journal_block_header *tail_header,
+			  slab_journal_payload *payload,
+			  slab_block_number sbn,
+			  enum journal_operation operation,
+			  bool increment)
+{
+	journal_entry_count_t entry_number = tail_header->entry_count++;
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		if (!tail_header->has_block_map_increments) {
+			memset(payload->full_entries.entry_types,
+			       0,
+			       VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE);
+			tail_header->has_block_map_increments = true;
+		}
+
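+		/* Flag this entry as a block map increment in the per-entry type bitmap. */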
+		payload->full_entries.entry_types[entry_number / 8] |=
+			((u8)1 << (entry_number % 8));
+	}
+
+	vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, increment);
+}
+
+/**
+ * expand_journal_point() - Convert a recovery journal journal_point which refers to both an
+ *                          increment and a decrement to a single point which refers to one or the
+ *                          other.
+ * @recovery_point: The journal point to convert.
+ * @increment: Whether the current entry is an increment.
+ *
+ * Return: The expanded journal point.
+ *
+ * Each data_vio has only a single recovery journal point, but may need to make both
+ * increment and decrement entries in the same slab journal. In order to distinguish the two
+ * entries, the entry count of the expanded journal point is twice the actual recovery journal
+ * entry count for increments, and one more than that for decrements.
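+ * For example, a recovery journal entry count of 5 expands to 10 for an increment and
+ * 11 for the corresponding decrement.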
+ */
+static struct journal_point
+expand_journal_point(struct journal_point recovery_point, bool increment)
+{
+	recovery_point.entry_count *= 2;
+	if (!increment)
+		recovery_point.entry_count++;
+
+	return recovery_point;
+}
+
+/**
+ * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a
+ *               block becomes full.
+ * @journal: The slab journal to append to.
+ * @pbn: The pbn being adjusted.
+ * @operation: The type of entry to make.
+ * @increment: True if this is an increment.
+ * @recovery_point: The expanded recovery point.
+ *
+ * This function is synchronous.
+ */
+static void add_entry(struct slab_journal *journal,
+		      physical_block_number_t pbn,
+		      enum journal_operation operation,
+		      bool increment,
+		      struct journal_point recovery_point)
+{
+	struct packed_slab_journal_block *block = journal->block;
+	int result;
+
+	result = ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point,
+						 &recovery_point),
+			"recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u",
+			(unsigned long long) recovery_point.sequence_number,
+			recovery_point.entry_count,
+			(unsigned long long) journal->tail_header.recovery_point.sequence_number,
+			journal->tail_header.recovery_point.entry_count);
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		return;
+	}
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		result = ASSERT((journal->tail_header.entry_count <
+				 journal->full_entries_per_block),
+				"block has room for full entries");
+		if (result != VDO_SUCCESS) {
+			vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+			return;
+		}
+	}
+
+	encode_slab_journal_entry(&journal->tail_header,
+				  &block->payload,
+				  pbn - journal->slab->start,
+				  operation,
+				  increment);
+	journal->tail_header.recovery_point = recovery_point;
+	if (block_is_full(journal))
+		commit_tail(journal);
+}
+
+static inline block_count_t journal_length(const struct slab_journal *journal)
+{
+	return journal->tail - journal->head;
+}
+
+/**
+ * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal.
+ * @slab: The slab to play into.
+ * @pbn: The PBN for the entry.
+ * @operation: The type of entry to add.
+ * @increment: True if this entry is an increment.
+ * @recovery_point: The recovery journal point corresponding to this entry.
+ * @parent: The completion to notify when there is space to add the entry if the entry could not be
+ *          added immediately.
+ *
+ * Return: true if the entry was added immediately.
+ */
+bool vdo_attempt_replay_into_slab(struct vdo_slab *slab,
+				  physical_block_number_t pbn,
+				  enum journal_operation operation,
+				  bool increment,
+				  struct journal_point *recovery_point,
+				  struct vdo_completion *parent)
+{
+	struct slab_journal *journal = &slab->journal;
+	struct slab_journal_block_header *header = &journal->tail_header;
+	struct journal_point expanded = expand_journal_point(*recovery_point, increment);
+
+	/* Only accept entries after the current recovery point. */
+	if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded))
+		return true;
+
+	if ((header->entry_count >= journal->full_entries_per_block) &&
+	    (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING)))
+		/*
+		 * The tail block does not have room for the entry we are attempting to add so
+		 * commit the tail block now.
+		 */
+		commit_tail(journal);
+
+	if (journal->waiting_to_commit) {
+		vdo_start_operation_with_waiter(&journal->slab->state,
+						VDO_ADMIN_STATE_WAITING_FOR_RECOVERY,
+						parent,
+						NULL);
+		return false;
+	}
+
+	if (journal_length(journal) >= journal->size) {
+		/*
+		 * We must have reaped the current head before the crash, since the blocking
+		 * threshold keeps us from having more entries than fit in a slab journal; hence we
+		 * can just advance the head (and unreapable block), as needed.
+		 */
+		journal->head++;
+		journal->unreapable++;
+	}
+
+	if (journal->slab->status == VDO_SLAB_REBUILT)
+		journal->slab->status = VDO_SLAB_REPLAYING;
+
+	add_entry(journal, pbn, operation, increment, expanded);
+	return true;
+}
+
+/**
+ * requires_reaping() - Check whether the journal must be reaped before adding new entries.
+ * @journal: The journal to check.
+ *
+ * Return: true if the journal must be reaped.
+ */
+static bool requires_reaping(const struct slab_journal *journal)
+{
+	return (journal_length(journal) >= journal->blocking_threshold);
+}
+
+/** finish_summary_update() - A waiter callback that resets the writing state of a slab. */
+static void finish_summary_update(struct waiter *waiter, void *context)
+{
+	struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter);
+	int result = *((int *) context);
+
+	slab->active_count--;
+
+	if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
+		uds_log_error_strerror(result, "failed to update slab summary");
+		vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
+	}
+
+	check_if_slab_drained(slab);
+}
+
+static void write_reference_block(struct waiter *waiter, void *context);
+
+/**
+ * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring
+ *                                  a VIO for it from the pool.
+ * @waiter: The waiter of the block which is starting to write.
+ * @context: The parent slab of the block.
+ *
+ * This can be asynchronous since the writer will have to wait if all VIOs in the pool are
+ * currently in use.
+ */
+static void launch_reference_block_write(struct waiter *waiter, void *context)
+{
+	struct vdo_slab *slab = context;
+
+	if (vdo_is_read_only(slab->allocator->depot->vdo))
+		return;
+
+	slab->active_count++;
+	container_of(waiter, struct reference_block, waiter)->is_writing = true;
+	waiter->callback = write_reference_block;
+	acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+}
+
+static void save_dirty_reference_blocks(struct vdo_slab *slab)
+{
+	vdo_notify_all_waiters(&slab->dirty_blocks, launch_reference_block_write, slab);
+	check_if_slab_drained(slab);
+}
+
+/**
+ * finish_reference_block_write() - After a reference block has written, clean it, release its
+ *                                  locks, and return its VIO to the pool.
+ * @completion: The VIO that just finished writing.
+ */
+static void finish_reference_block_write(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+	struct reference_block *block = completion->parent;
+	struct vdo_slab *slab = block->slab;
+	tail_block_offset_t offset;
+
+	slab->active_count--;
+
+	/* Release the slab journal lock. */
+	adjust_slab_journal_block_reference(&slab->journal,
+					    block->slab_journal_lock_to_release,
+					    -1);
+	return_vio_to_pool(slab->allocator->vio_pool, pooled);
+
+	/*
+	 * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
+	 * us to be dirtied again, but we don't want to double enqueue.
+	 */
+	block->is_writing = false;
+
+	if (vdo_is_read_only(completion->vdo)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	/* Re-queue the block if it was re-dirtied while it was writing. */
+	if (block->is_dirty) {
+		vdo_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
+		if (vdo_is_state_draining(&slab->state))
+			/* We must be saving, and this block will otherwise not be relaunched. */
+			save_dirty_reference_blocks(slab);
+
+		return;
+	}
+
+	/*
+	 * Mark the slab as clean in the slab summary if there are no dirty or writing blocks
+	 * and no summary update in progress.
+	 */
+	if ((slab->active_count > 0) || vdo_has_waiters(&slab->dirty_blocks))
+		return;
+
+	offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+	slab->active_count++;
+	slab->summary_waiter.callback = finish_summary_update;
+	update_slab_summary_entry(slab,
+				  &slab->summary_waiter,
+				  offset,
+				  true,
+				  true,
+				  slab->free_blocks);
+}
+
+/**
+ * get_reference_counters_for_block() - Find the reference counters for a given block.
+ * @block: The reference_block in question.
+ *
+ * Return: A pointer to the reference counters for this block.
+ */
+static vdo_refcount_t * __must_check
+get_reference_counters_for_block(struct reference_block *block)
+{
+	size_t block_index = block - block->slab->reference_blocks;
+
+	return &block->slab->counters[block_index * COUNTS_PER_BLOCK];
+}
+
+/**
+ * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out.
+ * @block: The block to copy.
+ * @buffer: The char buffer to fill with the packed block.
+ */
+static void pack_reference_block(struct reference_block *block, void *buffer)
+{
+	struct packed_reference_block *packed = buffer;
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+	sector_count_t i;
+	struct packed_journal_point commit_point;
+
+	vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point);
+
+	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
+		packed->sectors[i].commit_point = commit_point;
+		memcpy(packed->sectors[i].counts,
+		       counters + (i * COUNTS_PER_SECTOR),
+		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
+	}
+}
+
+static void write_reference_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct reference_block *block = vio->completion.parent;
+	thread_id_t thread_id = block->slab->allocator->thread_id;
+
+	continue_vio_after_io(vio, finish_reference_block_write, thread_id);
+}
+
+/**
+ * handle_io_error() - Handle an I/O error reading or writing a reference count block.
+ * @completion: The VIO doing the I/O as a completion.
+ */
+static void handle_io_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct vio *vio = as_vio(completion);
+	struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;
+
+	vio_record_metadata_io_error(vio);
+	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	slab->active_count--;
+	vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
+	check_if_slab_drained(slab);
+}
+
+/**
+ * write_reference_block() - After a dirty block waiter has gotten a VIO from the VIO pool, copy
+ *                           its counters and associated data into the VIO, and launch the write.
+ * @waiter: The waiter of the dirty block.
+ * @context: The VIO returned by the pool.
+ */
+static void write_reference_block(struct waiter *waiter, void *context)
+{
+	size_t block_offset;
+	physical_block_number_t pbn;
+	struct pooled_vio *pooled = context;
+	struct vdo_completion *completion = &pooled->vio.completion;
+	struct reference_block *block = container_of(waiter, struct reference_block, waiter);
+
+	pack_reference_block(block, pooled->vio.data);
+	block_offset = (block - block->slab->reference_blocks);
+	pbn = (block->slab->ref_counts_origin + block_offset);
+	block->slab_journal_lock_to_release = block->slab_journal_lock;
+	completion->parent = block;
+
+	/*
+	 * Mark the block as clean, since we won't be committing any updates that happen after this
+	 * moment. As long as VIO order is preserved, two VIOs updating this block at once will not
+	 * cause complications.
+	 */
+	block->is_dirty = false;
+
+	/*
+	 * Flush before writing to ensure that the recovery journal and slab journal entries which
+	 * cover this reference update are stable (VDO-2331).
+	 */
+	WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written,
+		   block->slab->allocator->ref_counts_statistics.blocks_written + 1);
+
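+	/* Run the write completion on the owning allocator's thread. */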
+	completion->callback_thread_id = ((struct block_allocator *) pooled->context)->thread_id;
+	submit_metadata_vio(&pooled->vio,
+			    pbn,
+			    write_reference_block_endio,
+			    handle_io_error,
+			    REQ_OP_WRITE | REQ_PREFLUSH);
+}
+
+static void reclaim_journal_space(struct slab_journal *journal)
+{
+	block_count_t length = journal_length(journal);
+	struct vdo_slab *slab = journal->slab;
+	block_count_t write_count = vdo_count_waiters(&slab->dirty_blocks);
+	block_count_t written;
+
+	if ((length < journal->flushing_threshold) || (write_count == 0))
+		return;
+
+	/* The slab journal is over the first threshold, schedule some reference block writes. */
+	WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1);
+	if (length < journal->flushing_deadline)
+		/* Schedule more writes the closer to the deadline we get. */
+		write_count = max_t(block_count_t,
+				    write_count / (journal->flushing_deadline - length + 1),
+				    1);
+
+	for (written = 0; written < write_count; written++)
+		vdo_notify_next_waiter(&slab->dirty_blocks, launch_reference_block_write, slab);
+}
+
+/**
+ * reference_count_to_status() - Convert a reference count to a reference status.
+ * @count: The count to convert.
+ *
+ * Return: The appropriate reference status.
+ */
+static enum reference_status __must_check
+reference_count_to_status(vdo_refcount_t count)
+{
+	if (count == EMPTY_REFERENCE_COUNT)
+		return RS_FREE;
+	else if (count == 1)
+		return RS_SINGLE;
+	else if (count == PROVISIONAL_REFERENCE_COUNT)
+		return RS_PROVISIONAL;
+	else
+		return RS_SHARED;
+}
+
+/**
+ * dirty_block() - Mark a reference count block as dirty, potentially adding it to the dirty queue
+ *                 if it wasn't already dirty.
+ * @block: The reference block to mark as dirty.
+ */
+static void dirty_block(struct reference_block *block)
+{
+	if (block->is_dirty)
+		return;
+
+	block->is_dirty = true;
+	if (!block->is_writing)
+		vdo_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
+}
+
+/**
+ * get_reference_block() - Get the reference block that covers the given block index.
+ */
+static struct reference_block * __must_check
+get_reference_block(struct vdo_slab *slab, slab_block_number index)
+{
+	return &slab->reference_blocks[index / COUNTS_PER_BLOCK];
+}
+
+/**
+ * slab_block_number_from_pbn() - Determine the index within the slab of a particular physical
+ *                                block number.
+ * @slab: The slab.
+ * @physical_block_number: The physical block number.
+ * @slab_block_number_ptr: A pointer to the slab block number.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check
+slab_block_number_from_pbn(struct vdo_slab *slab,
+			   physical_block_number_t physical_block_number,
+			   slab_block_number *slab_block_number_ptr)
+{
+	u64 slab_block_number;
+
+	if (physical_block_number < slab->start)
+		return VDO_OUT_OF_RANGE;
+
+	slab_block_number = physical_block_number - slab->start;
+	if (slab_block_number >= slab->allocator->depot->slab_config.data_blocks)
+		return VDO_OUT_OF_RANGE;
+
+	*slab_block_number_ptr = slab_block_number;
+	return VDO_SUCCESS;
+}
+
+/**
+ * get_reference_counter() - Get the reference counter that covers the given physical block number.
+ * @slab: The slab to query.
+ * @pbn: The physical block number.
+ * @counter_ptr: A pointer to the reference counter.
+ */
+static int __must_check
+get_reference_counter(struct vdo_slab *slab,
+		      physical_block_number_t pbn,
+		      vdo_refcount_t **counter_ptr)
+{
+	slab_block_number index;
+	int result = slab_block_number_from_pbn(slab, pbn, &index);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*counter_ptr = &slab->counters[index];
+
+	return VDO_SUCCESS;
+}
+
+static unsigned int calculate_slab_priority(struct vdo_slab *slab)
+{
+	block_count_t free_blocks = slab->free_blocks;
+	unsigned int unopened_slab_priority = slab->allocator->unopened_slab_priority;
+	unsigned int priority;
+
+	/*
+	 * Wholly full slabs must be the only ones with lowest priority, 0.
+	 *
+	 * Slabs that have never been opened (empty, newly initialized, and never been written to)
+	 * have lower priority than previously opened slabs that have a significant number of free
+	 * blocks. This ranking causes VDO to avoid writing physical blocks for the first time
+	 * unless there are very few free blocks that have been previously written to.
+	 *
+	 * Since VDO doesn't discard blocks currently, reusing previously written blocks makes VDO
+	 * a better client of any underlying storage that is thinly-provisioned (though discarding
+	 * would be better).
+	 *
+	 * For all other slabs, the priority is derived from the logarithm of the number of free
+	 * blocks. Slabs with the same order of magnitude of free blocks have the same priority.
+	 * With 2^23 blocks, the priority will range from 1 to 25. The reserved
+	 * unopened_slab_priority divides the range and is skipped by the logarithmic mapping.
+	 */
+
+	if (free_blocks == 0)
+		return 0;
+
+	if (is_slab_journal_blank(slab))
+		return unopened_slab_priority;
+
+	priority = (1 + ilog2(free_blocks));
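+	/* Adjust the priority to skip the reserved unopened_slab_priority value. */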
+	return ((priority < unopened_slab_priority) ? priority : priority + 1);
+}
+
+/*
+ * Slabs are essentially prioritized by an approximation of the number of free blocks in the slab,
+ * so slabs with lots of free blocks will be opened for allocation before slabs that have few free
+ * blocks.
+ */
+static void prioritize_slab(struct vdo_slab *slab)
+{
+	ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
+			"a slab must not already be on a ring when prioritizing");
+	slab->priority = calculate_slab_priority(slab);
+	vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
+				   slab->priority,
+				   &slab->allocq_entry);
+}
+
+/**
+ * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab.
+ * @slab: The slab whose free block count changed.
+ * @increment: True if the free block count went up.
+ */
+static void adjust_free_block_count(struct vdo_slab *slab, bool increment)
+{
+	struct block_allocator *allocator = slab->allocator;
+
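+	/* An increase in free blocks means one fewer allocated block, and vice versa. */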
+	WRITE_ONCE(allocator->allocated_blocks,
+		   allocator->allocated_blocks + (increment ? -1 : 1));
+
+	/* The open slab doesn't need to be reprioritized until it is closed. */
+	if (slab == allocator->open_slab)
+		return;
+
+	/* Don't bother adjusting the priority table if unneeded. */
+	if (slab->priority == calculate_slab_priority(slab))
+		return;
+
+	/*
+	 * Reprioritize the slab to reflect the new free block count by removing it from the table
+	 * and re-enqueuing it with the new priority.
+	 */
+	vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry);
+	prioritize_slab(slab);
+}
+
+/**
+ * increment_for_data() - Increment the reference count for a data block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the data block before this increment.
+ * @lock: The pbn_lock associated with this increment (may be NULL).
+ * @counter_ptr: A pointer to the count for the data block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int increment_for_data(struct vdo_slab *slab,
+			      struct reference_block *block,
+			      slab_block_number block_number,
+			      enum reference_status old_status,
+			      struct pbn_lock *lock,
+			      vdo_refcount_t *counter_ptr,
+			      bool adjust_block_count)
+{
+	switch (old_status) {
+	case RS_FREE:
+		*counter_ptr = 1;
+		block->allocated_count++;
+		slab->free_blocks--;
+		if (adjust_block_count)
+			adjust_free_block_count(slab, false);
+
+		break;
+
+	case RS_PROVISIONAL:
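+		/* A provisional reference is confirmed as a single real reference. */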
+		*counter_ptr = 1;
+		break;
+
+	default:
+		/* Single or shared */
+		if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT)
+			return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Incrementing a block already having 254 references (slab %u, offset %u)",
+						      slab->slab_number,
+						      block_number);
+		(*counter_ptr)++;
+	}
+
+	if (lock != NULL)
+		vdo_unassign_pbn_lock_provisional_reference(lock);
+	return VDO_SUCCESS;
+}
+
+/**
+ * decrement_for_data() - Decrement the reference count for a data block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the data block before this decrement.
+ * @updater: The reference updater doing this operation in case we need to look up the pbn lock.
+ * @counter_ptr: A pointer to the count for the data block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int decrement_for_data(struct vdo_slab *slab,
+			      struct reference_block *block,
+			      slab_block_number block_number,
+			      enum reference_status old_status,
+			      struct reference_updater *updater,
+			      vdo_refcount_t *counter_ptr,
+			      bool adjust_block_count)
+{
+	switch (old_status) {
+	case RS_FREE:
+		return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+					      "Decrementing free block at offset %u in slab %u",
+					      block_number,
+					      slab->slab_number);
+
+	case RS_PROVISIONAL:
+	case RS_SINGLE:
+		if (updater->zpbn.zone != NULL) {
+			struct pbn_lock *lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone,
+									       updater->zpbn.pbn);
+
+			if (lock != NULL) {
+				/*
+				 * There is a read lock on this block, so the block must not become
+				 * unreferenced.
+				 */
+				*counter_ptr = PROVISIONAL_REFERENCE_COUNT;
+				vdo_assign_pbn_lock_provisional_reference(lock);
+				break;
+			}
+		}
+
+		*counter_ptr = EMPTY_REFERENCE_COUNT;
+		block->allocated_count--;
+		slab->free_blocks++;
+		if (adjust_block_count)
+			adjust_free_block_count(slab, true);
+
+		break;
+
+	default:
+		/* Shared */
+		(*counter_ptr)--;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * increment_for_block_map() - Increment the reference count for a block map page.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the block before this increment.
+ * @lock: The pbn_lock associated with this increment (may be NULL).
+ * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
+ * @counter_ptr: A pointer to the count for the block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * All block map increments should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map
+ * blocks never dedupe, they should never be adjusted from any other state. The adjustment always
+ * results in MAXIMUM_REFERENCE_COUNT as this value is used to prevent dedupe against block map
+ * blocks.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int increment_for_block_map(struct vdo_slab *slab,
+				   struct reference_block *block,
+				   slab_block_number block_number,
+				   enum reference_status old_status,
+				   struct pbn_lock *lock,
+				   bool normal_operation,
+				   vdo_refcount_t *counter_ptr,
+				   bool adjust_block_count)
+{
+	switch (old_status) {
+	case RS_FREE:
+		if (normal_operation)
+			return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Incrementing unallocated block map block (slab %u, offset %u)",
+						      slab->slab_number,
+						      block_number);
+
+		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
+		block->allocated_count++;
+		slab->free_blocks--;
+		if (adjust_block_count)
+			adjust_free_block_count(slab, false);
+
+		return VDO_SUCCESS;
+
+	case RS_PROVISIONAL:
+		if (!normal_operation)
+			return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Block map block had provisional reference during replay (slab %u, offset %u)",
+						      slab->slab_number,
+						      block_number);
+
+		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
+		if (lock != NULL)
+			vdo_unassign_pbn_lock_provisional_reference(lock);
+		return VDO_SUCCESS;
+
+	default:
+		return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+					      "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)",
+					      *counter_ptr,
+					      slab->slab_number,
+					      block_number);
+	}
+}
+
+static bool __must_check is_valid_journal_point(const struct journal_point *point)
+{
+	return ((point != NULL) && (point->sequence_number > 0));
+}
+
+/**
+ * update_reference_count() - Update the reference count of a block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @slab_journal_point: The slab journal point at which this update is journaled.
+ * @updater: The reference updater.
+ * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
+ * @adjust_block_count: Whether to update the slab's free block count.
+ * @provisional_decrement_ptr: A pointer which will be set to true if this update was a decrement
+ *                             of a provisional reference.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int
+update_reference_count(struct vdo_slab *slab,
+		       struct reference_block *block,
+		       slab_block_number block_number,
+		       const struct journal_point *slab_journal_point,
+		       struct reference_updater *updater,
+		       bool normal_operation,
+		       bool adjust_block_count,
+		       bool *provisional_decrement_ptr)
+{
+	vdo_refcount_t *counter_ptr = &slab->counters[block_number];
+	enum reference_status old_status = reference_count_to_status(*counter_ptr);
+	int result;
+
+	if (!updater->increment) {
+		result = decrement_for_data(slab,
+					    block,
+					    block_number,
+					    old_status,
+					    updater,
+					    counter_ptr,
+					    adjust_block_count);
+		if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) {
+			if (provisional_decrement_ptr != NULL)
+				*provisional_decrement_ptr = true;
+			return VDO_SUCCESS;
+		}
+	} else if (updater->operation == VDO_JOURNAL_DATA_REMAPPING) {
+		result = increment_for_data(slab,
+					    block,
+					    block_number,
+					    old_status,
+					    updater->lock,
+					    counter_ptr,
+					    adjust_block_count);
+	} else {
+		result = increment_for_block_map(slab,
+						 block,
+						 block_number,
+						 old_status,
+						 updater->lock,
+						 normal_operation,
+						 counter_ptr,
+						 adjust_block_count);
+	}
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (is_valid_journal_point(slab_journal_point))
+		slab->slab_journal_point = *slab_journal_point;
+
+	return VDO_SUCCESS;
+}
+
+static int __must_check
+adjust_reference_count(struct vdo_slab *slab,
+		       struct reference_updater *updater,
+		       const struct journal_point *slab_journal_point)
+{
+	slab_block_number block_number;
+	int result;
+	struct reference_block *block;
+	bool provisional_decrement = false;
+
+	if (!is_slab_open(slab))
+		return VDO_INVALID_ADMIN_STATE;
+
+	result = slab_block_number_from_pbn(slab, updater->zpbn.pbn, &block_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	block = get_reference_block(slab, block_number);
+	result = update_reference_count(slab,
+					block,
+					block_number,
+					slab_journal_point,
+					updater,
+					NORMAL_OPERATION,
+					true,
+					&provisional_decrement);
+	if ((result != VDO_SUCCESS) || provisional_decrement)
+		return result;
+
+	if (block->is_dirty && (block->slab_journal_lock > 0)) {
+		sequence_number_t entry_lock = slab_journal_point->sequence_number;
+		/*
+		 * This block is already dirty and a slab journal entry has been made for it since
+		 * the last time it was clean. We must release the per-entry slab journal lock for
+		 * the entry associated with the update we are now doing.
+		 */
+		result = ASSERT(is_valid_journal_point(slab_journal_point),
+				"Reference count adjustments need slab journal points.");
+		if (result != VDO_SUCCESS)
+			return result;
+
+		adjust_slab_journal_block_reference(&slab->journal, entry_lock, -1);
+		return VDO_SUCCESS;
+	}
+
+	/*
+	 * This may be the first time we are applying an update for which there is a slab journal
+	 * entry to this block since the block was cleaned. Therefore, we convert the per-entry
+	 * slab journal lock to an uncommitted reference block lock, if there is a per-entry lock.
+	 */
+	if (is_valid_journal_point(slab_journal_point))
+		block->slab_journal_lock = slab_journal_point->sequence_number;
+	else
+		block->slab_journal_lock = 0;
+
+	dirty_block(block);
+	return VDO_SUCCESS;
+}
+
+/**
+ * add_entry_from_waiter() - Add an entry to the slab journal.
+ * @waiter: The vio which should make an entry now.
+ * @context: The slab journal to make an entry in.
+ *
+ * This callback is invoked by add_entries() once it has determined that we are ready to make
+ * another entry in the slab journal. Implements waiter_callback.
+ */
+static void add_entry_from_waiter(struct waiter *waiter, void *context)
+{
+	int result;
+	struct reference_updater *updater = container_of(waiter, struct reference_updater, waiter);
+	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
+	struct slab_journal *journal = context;
+	struct slab_journal_block_header *header = &journal->tail_header;
+	struct journal_point slab_journal_point = {
+		.sequence_number = header->sequence_number,
+		.entry_count = header->entry_count,
+	};
+	sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number;
+
+	if (header->entry_count == 0) {
+		/*
+		 * This is the first entry in the current tail block, so get a lock on the recovery
+		 * journal which we will hold until this tail block is committed.
+		 */
+		get_lock(journal, header->sequence_number)->recovery_start = recovery_block;
+		if (journal->recovery_journal != NULL) {
+			zone_count_t zone_number = journal->slab->allocator->zone_number;
+
+			vdo_acquire_recovery_journal_block_reference(journal->recovery_journal,
+								     recovery_block,
+								     VDO_ZONE_TYPE_PHYSICAL,
+								     zone_number);
+		}
+
+		mark_slab_journal_dirty(journal, recovery_block);
+		reclaim_journal_space(journal);
+	}
+
+	add_entry(journal,
+		  updater->zpbn.pbn,
+		  updater->operation,
+		  updater->increment,
+		  expand_journal_point(data_vio->recovery_journal_point, updater->increment));
+
+	if (journal->slab->status != VDO_SLAB_REBUILT) {
+		/*
+		 * If the slab is unrecovered, scrubbing will take care of the count since the
+		 * update is now recorded in the journal.
+		 */
+		adjust_slab_journal_block_reference(journal,
+						    slab_journal_point.sequence_number,
+						    -1);
+		result = VDO_SUCCESS;
+	} else {
+		/* Now that an entry has been made in the slab journal, update the counter. */
+		result = adjust_reference_count(journal->slab, updater, &slab_journal_point);
+	}
+
+	if (updater->increment)
+		continue_data_vio_with_error(data_vio, result);
+	else
+		vdo_continue_completion(&data_vio->decrement_completion, result);
+}
+
+/**
+ * is_next_entry_a_block_map_increment() - Check whether the next entry to be made is a block map
+ *                                         increment.
+ * @journal: The journal.
+ *
+ * Return: true if the first entry waiter's operation is a block map increment.
+ */
+static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal)
+{
+	struct waiter *waiter = vdo_get_first_waiter(&journal->entry_waiters);
+	struct reference_updater *updater = container_of(waiter, struct reference_updater, waiter);
+
+	return (updater->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING);
+}
+
+/**
+ * add_entries() - Add as many entries as possible from the queue of vios waiting to make entries.
+ * @journal: The journal to which entries may be added.
+ *
+ * By processing the queue in order, we ensure that slab journal entries are made in the same order
+ * as recovery journal entries for the same increment or decrement.
+ */
+static void add_entries(struct slab_journal *journal)
+{
+	if (journal->adding_entries)
+		/* Protect against re-entrancy. */
+		return;
+
+	journal->adding_entries = true;
+	while (vdo_has_waiters(&journal->entry_waiters)) {
+		struct slab_journal_block_header *header = &journal->tail_header;
+
+		if (journal->partial_write_in_progress ||
+		    (journal->slab->status == VDO_SLAB_REBUILDING))
+			/*
+			 * Don't add entries while rebuilding or while a partial write is
+			 * outstanding (VDO-2399).
+			 */
+			break;
+
+		if (journal->waiting_to_commit) {
+			/*
+			 * If we are waiting for resources to write the tail block, and the tail
+			 * block is full, we can't make another entry.
+			 */
+			WRITE_ONCE(journal->events->tail_busy_count,
+				   journal->events->tail_busy_count + 1);
+			break;
+		} else if (is_next_entry_a_block_map_increment(journal) &&
+			   (header->entry_count >= journal->full_entries_per_block)) {
+			/*
+			 * The tail block does not have room for a block map increment, so commit
+			 * it now.
+			 */
+			commit_tail(journal);
+			if (journal->waiting_to_commit) {
+				WRITE_ONCE(journal->events->tail_busy_count,
+					   journal->events->tail_busy_count + 1);
+				break;
+			}
+		}
+
+		/* If the slab is over the blocking threshold, make the vio wait. */
+		if (requires_reaping(journal)) {
+			WRITE_ONCE(journal->events->blocked_count,
+				   journal->events->blocked_count + 1);
+			save_dirty_reference_blocks(journal->slab);
+			break;
+		}
+
+		if (header->entry_count == 0) {
+			struct journal_lock *lock = get_lock(journal, header->sequence_number);
+
+			/*
+			 * Check if the on disk slab journal is full. Because of the blocking and
+			 * scrubbing thresholds, this should never happen.
+			 */
+			if (lock->count > 0) {
+				ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail,
+						"New block has locks, but journal is not full");
+
+				/*
+				 * The blocking threshold must let the journal fill up if the new
+				 * block has locks; if the blocking threshold is smaller than the
+				 * journal size, the new block cannot possibly have locks already.
+				 */
+				ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size),
+						"New block can have locks already iff blocking threshold is at the end of the journal");
+
+				WRITE_ONCE(journal->events->disk_full_count,
+					   journal->events->disk_full_count + 1);
+				save_dirty_reference_blocks(journal->slab);
+				break;
+			}
+
+			/*
+			 * Don't allow the new block to be reaped until all of the reference count
+			 * blocks are written and the journal block has been fully committed as
+			 * well.
+			 */
+			lock->count = journal->entries_per_block + 1;
+
+			if (header->sequence_number == 1) {
+				struct vdo_slab *slab = journal->slab;
+				block_count_t i;
+
+				/*
+				 * This is the first entry in this slab journal, ever. Dirty all of
+				 * the reference count blocks. Each will acquire a lock on the tail
+				 * block so that the journal won't be reaped until the reference
+				 * counts are initialized. The lock acquisition must be done by the
+				 * ref_counts since here we don't know how many reference blocks
+				 * the ref_counts has.
+				 */
+				for (i = 0; i < slab->reference_block_count; i++) {
+					slab->reference_blocks[i].slab_journal_lock = 1;
+					dirty_block(&slab->reference_blocks[i]);
+				}
+
+				adjust_slab_journal_block_reference(journal,
+								    1,
+								    slab->reference_block_count);
+			}
+		}
+
+		vdo_notify_next_waiter(&journal->entry_waiters, add_entry_from_waiter, journal);
+	}
+
+	journal->adding_entries = false;
+
+	/* If there are no waiters, and we are flushing or saving, commit the tail block. */
+	if (vdo_is_state_draining(&journal->slab->state) &&
+	    !vdo_is_state_suspending(&journal->slab->state) &&
+	    !vdo_has_waiters(&journal->entry_waiters))
+		commit_tail(journal);
+}
+
+/**
+ * reset_search_cursor() - Reset the free block search back to the first reference counter in the
+ *                         first reference block of a slab.
+ */
+static void reset_search_cursor(struct vdo_slab *slab)
+{
+	struct search_cursor *cursor = &slab->search_cursor;
+
+	cursor->block = cursor->first_block;
+	cursor->index = 0;
+	/* Unit tests have slabs with only one reference block (and it's a runt). */
+	cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count);
+}
+
+/**
+ * advance_search_cursor() - Advance the search cursor to the start of the next reference block in
+ *                           a slab,
+ *                           a slab.
+ * Wraps around to the first reference block if the current block is the last reference block.
+ *
+ * Return: true unless the cursor was at the last reference block.
+ */
+static bool advance_search_cursor(struct vdo_slab *slab)
+{
+	struct search_cursor *cursor = &slab->search_cursor;
+
+	/*
+	 * If we just finished searching the last reference block, then wrap back around to the
+	 * start of the array.
+	 */
+	if (cursor->block == cursor->last_block) {
+		reset_search_cursor(slab);
+		return false;
+	}
+
+	/* We're not already at the end, so advance the cursor to the next block. */
+	cursor->block++;
+	cursor->index = cursor->end_index;
+
+	if (cursor->block == cursor->last_block)
+		/* The last reference block will usually be a runt. */
+		cursor->end_index = slab->block_count;
+	else
+		cursor->end_index += COUNTS_PER_BLOCK;
+	return true;
+}
+
+/**
+ * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild.
+ * @depot: The slab depot containing the block.
+ * @pbn: The physical block number of the block to adjust.
+ * @operation: The journal operation of the entry being replayed.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
+					   physical_block_number_t pbn,
+					   enum journal_operation operation)
+{
+	int result;
+	slab_block_number block_number;
+	struct reference_block *block;
+	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
+	struct reference_updater updater = {
+		.operation = operation,
+		.increment = true,
+	};
+
+	result = slab_block_number_from_pbn(slab, pbn, &block_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	block = get_reference_block(slab, block_number);
+	result = update_reference_count(slab,
+					block,
+					block_number,
+					NULL,
+					&updater,
+					!NORMAL_OPERATION,
+					false,
+					NULL);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	dirty_block(block);
+	return VDO_SUCCESS;
+}
+
+/**
+ * replay_reference_count_change() - Replay the reference count adjustment from a slab journal
+ *                                   entry into the reference count for a block.
+ * @slab: The slab.
+ * @entry_point: The slab journal point for the entry.
+ * @entry: The slab journal entry being replayed.
+ *
+ * The adjustment will be ignored if it was already recorded in the reference count.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int
+replay_reference_count_change(struct vdo_slab *slab,
+			      const struct journal_point *entry_point,
+			      struct slab_journal_entry entry)
+{
+	int result;
+	struct reference_block *block = get_reference_block(slab, entry.sbn);
+	sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR;
+	struct reference_updater updater = {
+		.operation = entry.operation,
+		.increment = entry.increment,
+	};
+
+	if (!vdo_before_journal_point(&block->commit_points[sector], entry_point))
+		/* This entry is already reflected in the existing counts, so do nothing. */
+		return VDO_SUCCESS;
+
+	/* This entry is not yet counted in the reference counts. */
+	result = update_reference_count(slab,
+					block,
+					entry.sbn,
+					entry_point,
+					&updater,
+					!NORMAL_OPERATION,
+					false,
+					NULL);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	dirty_block(block);
+	return VDO_SUCCESS;
+}
+
+/**
+ * find_zero_byte_in_word() - Find the array index of the first zero byte in a word-sized range
+ *                            of reference counters.
+ * @word_ptr: A pointer to the eight counter bytes to check.
+ * @start_index: The array index corresponding to word_ptr[0].
+ * @fail_index: The array index to return if no zero byte is found.
+ *
+ * The search does no bounds checking; the function relies on the array being sufficiently padded.
+ *
+ * Return: The array index of the first zero byte in the word, or the value passed as fail_index if
+ *         no zero byte was found.
+ */
+static inline slab_block_number
+find_zero_byte_in_word(const u8 *word_ptr,
+		       slab_block_number start_index,
+		       slab_block_number fail_index)
+{
+	u64 word = get_unaligned_le64(word_ptr);
+
+	/* This looks like a loop, but GCC will unroll the eight iterations for us. */
+	unsigned int offset;
+
+	for (offset = 0; offset < BYTES_PER_WORD; offset++) {
+		/* The word was loaded little-endian, so the low byte is the next counter. */
+		if ((word & 0xFF) == 0)
+			return (start_index + offset);
+		word >>= 8;
+	}
+
+	return fail_index;
+}
+
+/**
+ * find_free_block() - Find the first block with a reference count of zero in the range of
+ *                     reference counter indexes covered by the slab's search cursor.
+ * @slab: The slab whose counters are to be scanned.
+ * @index_ptr: A pointer to hold the array index of the free block.
+ *
+ * Exposed for unit testing.
+ *
+ * Return: true if a free block was found in the specified range.
+ */
+static bool
+find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr)
+{
+	slab_block_number zero_index;
+	slab_block_number next_index = slab->search_cursor.index;
+	slab_block_number end_index = slab->search_cursor.end_index;
+	u8 *next_counter = &slab->counters[next_index];
+	u8 *end_counter = &slab->counters[end_index];
+
+	/*
+	 * Search every byte of the first unaligned word. (Array is padded so reading past end is
+	 * safe.)
+	 */
+	zero_index = find_zero_byte_in_word(next_counter, next_index, end_index);
+	if (zero_index < end_index) {
+		*index_ptr = zero_index;
+		return true;
+	}
+
+	/*
+	 * On architectures where unaligned word access is expensive, this would be a good place to
+	 * advance to an alignment boundary.
+	 */
+	next_index += BYTES_PER_WORD;
+	next_counter += BYTES_PER_WORD;
+
+	/*
+	 * Now we're word-aligned; check a word at a time until we find a word containing a zero.
+	 * (Array is padded so reading past end is safe.)
+	 */
+	while (next_counter < end_counter) {
+		/*
+		 * The following code is currently an exact copy of the code preceding the loop,
+		 * but if you try to merge them by using a do loop, it runs slower because a jump
+		 * instruction gets added at the start of the iteration.
+		 */
+		zero_index = find_zero_byte_in_word(next_counter, next_index, end_index);
+		if (zero_index < end_index) {
+			*index_ptr = zero_index;
+			return true;
+		}
+
+		next_index += BYTES_PER_WORD;
+		next_counter += BYTES_PER_WORD;
+	}
+
+	return false;
+}
+
+/**
+ * search_current_reference_block() - Search the reference block currently saved in the search
+ *                                    cursor for a reference count of zero, starting at the saved
+ *                                    counter index.
+ * @slab: The slab to search.
+ * @free_index_ptr: A pointer to receive the array index of the zero reference count.
+ *
+ * Return: true if an unreferenced counter was found.
+ */
+static bool search_current_reference_block(const struct vdo_slab *slab,
+					   slab_block_number *free_index_ptr)
+{
+	/* Don't bother searching if the current block is known to be full. */
+	return ((slab->search_cursor.block->allocated_count < COUNTS_PER_BLOCK) &&
+		find_free_block(slab, free_index_ptr));
+}
+
+/**
+ * search_reference_blocks() - Search each reference block for a reference count of zero.
+ * @slab: The slab to search.
+ * @free_index_ptr: A pointer to receive the array index of the zero reference count.
+ *
+ * Searches each reference block for a reference count of zero, starting at the reference block and
+ * counter index saved in the search cursor and searching up to the end of the last reference
+ * block. The search does not wrap.
+ *
+ * Return: true if an unreferenced counter was found.
+ */
+static bool
+search_reference_blocks(struct vdo_slab *slab, slab_block_number *free_index_ptr)
+{
+	/* Start searching at the saved search position in the current block. */
+	if (search_current_reference_block(slab, free_index_ptr))
+		return true;
+
+	/* Search each reference block up to the end of the slab. */
+	while (advance_search_cursor(slab))
+		if (search_current_reference_block(slab, free_index_ptr))
+			return true;
+
+	return false;
+}
+
+/**
+ * make_provisional_reference() - Do the bookkeeping for making a provisional reference.
+ */
+static void
+make_provisional_reference(struct vdo_slab *slab, slab_block_number block_number)
+{
+	struct reference_block *block = get_reference_block(slab, block_number);
+
+	/*
+	 * Make the initial transition from an unreferenced block to a
+	 * provisionally allocated block.
+	 */
+	slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT;
+
+	/* Account for the allocation. */
+	block->allocated_count++;
+	slab->free_blocks--;
+}
+
+/**
+ * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty.
+ */
+static void dirty_all_reference_blocks(struct vdo_slab *slab)
+{
+	block_count_t i;
+
+	for (i = 0; i < slab->reference_block_count; i++)
+		dirty_block(&slab->reference_blocks[i]);
+}
+
+/**
+ * clear_provisional_references() - Clear the provisional reference counts from a reference block.
+ * @block: The block to clear.
+ */
+static void clear_provisional_references(struct reference_block *block)
+{
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+	block_count_t j;
+
+	for (j = 0; j < COUNTS_PER_BLOCK; j++) {
+		if (counters[j] == PROVISIONAL_REFERENCE_COUNT) {
+			counters[j] = EMPTY_REFERENCE_COUNT;
+			block->allocated_count--;
+		}
+	}
+}
+
+static inline bool journal_points_equal(struct journal_point first, struct journal_point second)
+{
+	return ((first.sequence_number == second.sequence_number) &&
+		(first.entry_count == second.entry_count));
+}
+
+/**
+ * unpack_reference_block() - Unpack a packed reference block into the internal memory structure.
+ * @packed: The written reference block to be unpacked.
+ * @block: The internal reference block to be loaded.
+ */
+static void
+unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block)
+{
+	block_count_t index;
+	sector_count_t i;
+	struct vdo_slab *slab = block->slab;
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+
+	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
+		struct packed_reference_sector *sector = &packed->sectors[i];
+
+		vdo_unpack_journal_point(&sector->commit_point, &block->commit_points[i]);
+		memcpy(counters + (i * COUNTS_PER_SECTOR),
+		       sector->counts,
+		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
+		/* The slab_journal_point must be the latest point found in any sector. */
+		if (vdo_before_journal_point(&slab->slab_journal_point, &block->commit_points[i]))
+			slab->slab_journal_point = block->commit_points[i];
+
+		if ((i > 0) &&
+		    !journal_points_equal(block->commit_points[0], block->commit_points[i])) {
+			size_t block_index = block - block->slab->reference_blocks;
+
+			uds_log_warning("Torn write detected in sector %u of reference block %zu of slab %u",
+					i,
+					block_index,
+					block->slab->slab_number);
+		}
+	}
+
+	block->allocated_count = 0;
+	for (index = 0; index < COUNTS_PER_BLOCK; index++)
+		if (counters[index] != EMPTY_REFERENCE_COUNT)
+			block->allocated_count++;
+}
+
+/**
+ * finish_reference_block_load() - After a reference block has been read, unpack it.
+ * @completion: The VIO that just finished reading.
+ */
+static void finish_reference_block_load(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+	struct reference_block *block = completion->parent;
+	struct vdo_slab *slab = block->slab;
+
+	unpack_reference_block((struct packed_reference_block *) vio->data, block);
+	return_vio_to_pool(slab->allocator->vio_pool, pooled);
+	slab->active_count--;
+	clear_provisional_references(block);
+
+	slab->free_blocks -= block->allocated_count;
+	check_if_slab_drained(slab);
+}
+
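+/* The bio endio for a reference block read; continues on the block's allocator thread. */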
+static void load_reference_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct reference_block *block = vio->completion.parent;
+
+	continue_vio_after_io(vio, finish_reference_block_load, block->slab->allocator->thread_id);
+}
+
+/**
+ * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
+ *                          block.
+ * @waiter: The waiter of the block to load.
+ * @context: The VIO returned by the pool.
+ */
+static void load_reference_block(struct waiter *waiter, void *context)
+{
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	struct reference_block *block = container_of(waiter, struct reference_block, waiter);
+	size_t block_offset = (block - block->slab->reference_blocks);
+
+	vio->completion.parent = block;
+	submit_metadata_vio(vio,
+			    block->slab->ref_counts_origin + block_offset,
+			    load_reference_block_endio,
+			    handle_io_error,
+			    REQ_OP_READ);
+}
+
+/**
+ * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into the
+ *                           slab's pre-allocated reference counter array.
+ */
+static void load_reference_blocks(struct vdo_slab *slab)
+{
+	block_count_t i;
+
+	slab->free_blocks = slab->block_count;
+	slab->active_count = slab->reference_block_count;
+	for (i = 0; i < slab->reference_block_count; i++) {
+		struct waiter *waiter = &slab->reference_blocks[i].waiter;
+
+		waiter->callback = load_reference_block;
+		acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+	}
+}
+
+/**
+ * drain_slab() - Drain all reference count I/O.
+ *
+ * Depending upon the type of drain being performed (as recorded in the slab's admin state), the
+ * reference blocks may be loaded from disk, or dirty reference blocks may be written out.
+ */
+static void drain_slab(struct vdo_slab *slab)
+{
+	bool save;
+	bool load;
+	const struct admin_state_code *state = vdo_get_admin_state_code(&slab->state);
+
+	if (state == VDO_ADMIN_STATE_SUSPENDING)
+		return;
+
+	if ((state != VDO_ADMIN_STATE_REBUILDING) && (state != VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING))
+		commit_tail(&slab->journal);
+
+	if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL))
+		return;
+
+	save = false;
+	load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts;
+	if (state == VDO_ADMIN_STATE_SCRUBBING) {
+		if (load) {
+			load_reference_blocks(slab);
+			return;
+		}
+	} else if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) {
+		if (!load)
+			/* These reference counts were never written, so mark them all dirty. */
+			dirty_all_reference_blocks(slab);
+
+		save = true;
+	} else if (state == VDO_ADMIN_STATE_REBUILDING) {
+		/*
+		 * Write out the counters if the slab has written them before, or it has any
+		 * non-zero reference counts, or there are any slab journal blocks.
+		 */
+		block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks;
+
+		if (load ||
+		    (slab->free_blocks != data_blocks) ||
+		    !is_slab_journal_blank(slab)) {
+			dirty_all_reference_blocks(slab);
+			save = true;
+		}
+	} else if (state == VDO_ADMIN_STATE_SAVING) {
+		save = (slab->status == VDO_SLAB_REBUILT);
+	} else {
+		vdo_finish_draining_with_result(&slab->state, VDO_SUCCESS);
+		return;
+	}
+
+	if (save)
+		save_dirty_reference_blocks(slab);
+}
+
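+/**
+ * allocate_slab_counters() - Allocate a slab's reference blocks and reference counter array, and
+ *                            initialize its search cursor.
+ * @slab: The slab to allocate counters for.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */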
+static int allocate_slab_counters(struct vdo_slab *slab)
+{
+	int result;
+	size_t index, bytes;
+
+	result = ASSERT(slab->reference_blocks == NULL,
+			"vdo_slab %u doesn't allocate refcounts twice",
+			slab->slab_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(slab->reference_block_count,
+			      struct reference_block,
+			      __func__,
+			      &slab->reference_blocks);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/*
+	 * Allocate such that the runt slab has a full-length memory array, plus a little padding
+	 * so we can word-search even at the very end.
+	 */
+	bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD);
+	result = UDS_ALLOCATE(bytes, vdo_refcount_t, "ref counts array", &slab->counters);
+	if (result != UDS_SUCCESS) {
+		UDS_FREE(UDS_FORGET(slab->reference_blocks));
+		return result;
+	}
+
+	slab->search_cursor.first_block = slab->reference_blocks;
+	slab->search_cursor.last_block = &slab->reference_blocks[slab->reference_block_count - 1];
+	reset_search_cursor(slab);
+
+	for (index = 0; index < slab->reference_block_count; index++) {
+		slab->reference_blocks[index] = (struct reference_block) {
+			.slab = slab,
+		};
+	}
+
+	return VDO_SUCCESS;
+}
+
+static int allocate_counters_if_clean(struct vdo_slab *slab)
+{
+	if (vdo_is_state_clean_load(&slab->state))
+		return allocate_slab_counters(slab);
+
+	return VDO_SUCCESS;
+}
+
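+/**
+ * finish_loading_journal() - Decode the slab journal tail block which has just been read, set up
+ *                            the journal state if the block is valid, and finish the load.
+ * @completion: The vio which read the tail block.
+ */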
+static void finish_loading_journal(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct slab_journal *journal = completion->parent;
+	struct vdo_slab *slab = journal->slab;
+	struct packed_slab_journal_block *block = (struct packed_slab_journal_block *) vio->data;
+	struct slab_journal_block_header header;
+
+	vdo_unpack_slab_journal_block_header(&block->header, &header);
+
+	/* FIXME: should it be an error if the following conditional fails? */
+	if ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) &&
+	    (header.nonce == slab->allocator->nonce)) {
+		journal->tail = header.sequence_number + 1;
+
+		/*
+		 * If the slab is clean, this implies the slab journal is empty, so advance the
+		 * head appropriately.
+		 */
+		journal->head = (slab->allocator->summary_entries[slab->slab_number].is_dirty ?
+				 header.head :
+				 journal->tail);
+		journal->tail_header = header;
+		initialize_journal_state(journal);
+	}
+
+	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
+}
+
+static void read_slab_journal_tail_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio, finish_loading_journal, journal->slab->allocator->thread_id);
+}
+
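+/**
+ * handle_load_error() - Handle an error reading the slab journal tail block by returning the vio
+ *                       to the pool and finishing the load with the error.
+ * @completion: The vio which failed.
+ */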
+static void handle_load_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct slab_journal *journal = completion->parent;
+	struct vio *vio = as_vio(completion);
+
+	vio_record_metadata_io_error(vio);
+	return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	vdo_finish_loading_with_result(&journal->slab->state, result);
+}
+
+/**
+ * read_slab_journal_tail() - Read the slab journal tail block by using a vio acquired from the vio
+ *                            pool.
+ * @waiter: The vio pool waiter which has just been notified.
+ * @context: The vio pool entry given to the waiter.
+ *
+ * This is the success callback from acquire_vio_from_pool() when loading a slab journal.
+ */
+static void read_slab_journal_tail(struct waiter *waiter, void *context)
+{
+	struct slab_journal *journal = container_of(waiter, struct slab_journal, resource_waiter);
+	struct vdo_slab *slab = journal->slab;
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	tail_block_offset_t last_commit_point =
+		slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+
+	/*
+	 * The slab summary keeps the commit point offset, so the tail block is the block before
+	 * that. The calculation supports the small journals used in unit tests.
+	 */
+	tail_block_offset_t tail_block = ((last_commit_point == 0) ?
+					  (tail_block_offset_t)(journal->size - 1) :
+					  (last_commit_point - 1));
+
+	vio->completion.parent = journal;
+	vio->completion.callback_thread_id = slab->allocator->thread_id;
+	submit_metadata_vio(vio,
+			    slab->journal_origin + tail_block,
+			    read_slab_journal_tail_endio,
+			    handle_load_error,
+			    REQ_OP_READ);
+}
+
+/**
+ * load_slab_journal() - Load a slab's journal by reading the journal's tail.
+ */
+static void load_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	tail_block_offset_t last_commit_point;
+
+	last_commit_point = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+	if ((last_commit_point == 0) &&
+	    !slab->allocator->summary_entries[slab->slab_number].load_ref_counts) {
+		/*
+		 * This slab claims that it has a tail block at (journal->size - 1), but a head of
+		 * 1. This is impossible on a real system due to the scrubbing threshold, so don't
+		 * bother reading the (bogus) data off disk.
+		 */
+		ASSERT_LOG_ONLY(((journal->size < 16) ||
+				 (journal->scrubbing_threshold < (journal->size - 1))),
+				"Scrubbing threshold protects against reads of unwritten slab journal blocks");
+		vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
+		return;
+	}
+
+	journal->resource_waiter.callback = read_slab_journal_tail;
+	acquire_vio_from_pool(slab->allocator->vio_pool, &journal->resource_waiter);
+}
+
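+/**
+ * register_slab_for_scrubbing() - Queue an unrecovered slab for scrubbing, on the high-priority
+ *                                 list if requested.
+ * @slab: The slab to queue for scrubbing.
+ * @high_priority: Whether the slab should be scrubbed at high priority.
+ */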
+static void register_slab_for_scrubbing(struct vdo_slab *slab, bool high_priority)
+{
+	struct slab_scrubber *scrubber = &slab->allocator->scrubber;
+
+	ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT), "slab to be scrubbed is unrecovered");
+
+	if (slab->status != VDO_SLAB_REQUIRES_SCRUBBING)
+		return;
+
+	list_del_init(&slab->allocq_entry);
+	if (!slab->was_queued_for_scrubbing) {
+		WRITE_ONCE(scrubber->slab_count, scrubber->slab_count + 1);
+		slab->was_queued_for_scrubbing = true;
+	}
+
+	if (high_priority) {
+		slab->status = VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING;
+		list_add_tail(&slab->allocq_entry, &scrubber->high_priority_slabs);
+		return;
+	}
+
+	list_add_tail(&slab->allocq_entry, &scrubber->slabs);
+}
+
+/* Queue a slab for allocation or scrubbing. */
+static void queue_slab(struct vdo_slab *slab)
+{
+	struct block_allocator *allocator = slab->allocator;
+	block_count_t free_blocks;
+	int result;
+
+	ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
+			"a requeued slab must not already be on a ring");
+
+	if (vdo_is_read_only(allocator->depot->vdo))
+		return;
+
+	free_blocks = slab->free_blocks;
+	result = ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks),
+			"rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)",
+			slab->slab_number,
+			(unsigned long long) free_blocks,
+			(unsigned long long) allocator->depot->slab_config.data_blocks);
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(allocator->depot->vdo, result);
+		return;
+	}
+
+	if (slab->status != VDO_SLAB_REBUILT) {
+		register_slab_for_scrubbing(slab, false);
+		return;
+	}
+
+	if (!vdo_is_state_resuming(&slab->state)) {
+		/*
+		 * If the slab is resuming, we've already accounted for it here, so don't do it
+		 * again.
+		 * FIXME: under what situation would the slab be resuming here?
+		 */
+		WRITE_ONCE(allocator->allocated_blocks, allocator->allocated_blocks - free_blocks);
+		if (!is_slab_journal_blank(slab))
+			WRITE_ONCE(allocator->statistics.slabs_opened,
+				   allocator->statistics.slabs_opened + 1);
+	}
+
+	if (allocator->depot->vdo->suspend_type == VDO_ADMIN_STATE_SAVING)
+		reopen_slab_journal(slab);
+
+	prioritize_slab(slab);
+}
+
+/**
+ * initiate_slab_action() - Initiate a slab action.
+ *
+ * Implements vdo_admin_initiator.
+ */
+static void initiate_slab_action(struct admin_state *state)
+{
+	struct vdo_slab *slab = container_of(state, struct vdo_slab, state);
+
+	if (vdo_is_state_draining(state)) {
+		const struct admin_state_code *operation = vdo_get_admin_state_code(state);
+
+		if (operation == VDO_ADMIN_STATE_SCRUBBING)
+			slab->status = VDO_SLAB_REBUILDING;
+
+		drain_slab(slab);
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	if (vdo_is_state_loading(state)) {
+		load_slab_journal(slab);
+		return;
+	}
+
+	if (vdo_is_state_resuming(state)) {
+		queue_slab(slab);
+		vdo_finish_resuming(state);
+		return;
+	}
+
+	vdo_finish_operation(state, VDO_INVALID_ADMIN_STATE);
+}
+
+/**
+ * get_next_slab() - Get the next slab to scrub.
+ * @scrubber: The slab scrubber.
+ *
+ * Return: The next slab to scrub or NULL if there are none.
+ */
+static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber)
+{
+	struct vdo_slab *slab;
+
+	slab = list_first_entry_or_null(&scrubber->high_priority_slabs,
+					struct vdo_slab,
+					allocq_entry);
+	if (slab != NULL)
+		return slab;
+
+	return list_first_entry_or_null(&scrubber->slabs, struct vdo_slab, allocq_entry);
+}
+
+/**
+ * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub.
+ * @scrubber: The scrubber to check.
+ *
+ * Return: true if the scrubber has slabs to scrub.
+ */
+static bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber)
+{
+	return (get_next_slab(scrubber) != NULL);
+}
+
+/**
+ * uninitialize_scrubber_vio() - Clean up the slab_scrubber's vio.
+ * @scrubber: The scrubber.
+ */
+static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber)
+{
+	UDS_FREE(UDS_FORGET(scrubber->vio.data));
+	free_vio_components(&scrubber->vio);
+}
+
+/**
+ * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because
+ *                      there's been an error.
+ * @scrubber: The scrubber.
+ * @result: The result of the scrubbing operation.
+ */
+static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
+{
+	bool notify = vdo_has_waiters(&scrubber->waiters);
+	bool done = !has_slabs_to_scrub(scrubber);
+	struct block_allocator *allocator =
+		container_of(scrubber, struct block_allocator, scrubber);
+
+	if (done)
+		uninitialize_scrubber_vio(scrubber);
+
+	if (scrubber->high_priority_only) {
+		scrubber->high_priority_only = false;
+		vdo_fail_completion(UDS_FORGET(scrubber->vio.completion.parent), result);
+	} else if (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) {
+		/* All of our slabs were scrubbed, and we're the last allocator to finish. */
+		enum vdo_state prior_state =
+			atomic_cmpxchg(&allocator->depot->vdo->state, VDO_RECOVERING, VDO_DIRTY);
+
+		/*
+		 * To be safe, even if the CAS failed, ensure anything that follows is ordered with
+		 * respect to whatever state change did happen.
+		 */
+		smp_mb__after_atomic();
+
+		/*
+		 * We must check the VDO state here and not the depot's read_only_notifier since
+		 * the compare-swap-above could have failed due to a read-only entry which our own
+		 * thread does not yet know about.
+		 */
+		if (prior_state == VDO_DIRTY)
+			uds_log_info("VDO commencing normal operation");
+		else if (prior_state == VDO_RECOVERING)
+			uds_log_info("Exiting recovery mode");
+	}
+
+	/*
+	 * Note that the scrubber has stopped, and inform anyone who might be waiting for that to
+	 * happen.
+	 */
+	if (!vdo_finish_draining(&scrubber->admin_state))
+		WRITE_ONCE(scrubber->admin_state.current_state, VDO_ADMIN_STATE_SUSPENDED);
+
+	/*
+	 * We can't notify waiters until after we've finished draining or they'll just requeue.
+	 * Fortunately if there were waiters, we can't have been freed yet.
+	 */
+	if (notify)
+		vdo_notify_all_waiters(&scrubber->waiters, NULL, NULL);
+}
+
+static void scrub_next_slab(struct slab_scrubber *scrubber);
+
+/**
+ * slab_scrubbed() - Notify the scrubber that a slab has been scrubbed.
+ * @completion: The slab rebuild completion.
+ *
+ * This callback is registered in apply_journal_entries().
+ */
+static void slab_scrubbed(struct vdo_completion *completion)
+{
+	struct slab_scrubber *scrubber =
+		container_of(as_vio(completion), struct slab_scrubber, vio);
+	struct vdo_slab *slab = scrubber->slab;
+
+	slab->status = VDO_SLAB_REBUILT;
+	queue_slab(slab);
+	reopen_slab_journal(slab);
+	WRITE_ONCE(scrubber->slab_count, scrubber->slab_count - 1);
+	scrub_next_slab(scrubber);
+}
+
+/**
+ * abort_scrubbing() - Abort scrubbing due to an error.
+ * @scrubber: The slab scrubber.
+ * @result: The error.
+ */
+static void abort_scrubbing(struct slab_scrubber *scrubber, int result)
+{
+	vdo_enter_read_only_mode(scrubber->vio.completion.vdo, result);
+	finish_scrubbing(scrubber, result);
+}
+
+/**
+ * handle_scrubber_error() - Handle errors while rebuilding a slab.
+ * @completion: The slab rebuild completion.
+ */
+static void handle_scrubber_error(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+
+	vio_record_metadata_io_error(vio);
+	abort_scrubbing(container_of(vio, struct slab_scrubber, vio), completion->result);
+}
+
+/**
+ * apply_block_entries() - Apply all the entries in a block to the reference counts.
+ * @block: A block with entries to apply.
+ * @entry_count: The number of entries to apply.
+ * @block_number: The sequence number of the block.
+ * @slab: The slab to apply the entries to.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int apply_block_entries(struct packed_slab_journal_block *block,
+			       journal_entry_count_t entry_count,
+			       sequence_number_t block_number,
+			       struct vdo_slab *slab)
+{
+	struct journal_point entry_point = {
+		.sequence_number = block_number,
+		.entry_count = 0,
+	};
+	int result;
+	slab_block_number max_sbn = slab->end - slab->start;
+
+	while (entry_point.entry_count < entry_count) {
+		struct slab_journal_entry entry =
+			vdo_decode_slab_journal_entry(block, entry_point.entry_count);
+
+		if (entry.sbn > max_sbn)
+			/* This entry is out of bounds. */
+			return uds_log_error_strerror(VDO_CORRUPT_JOURNAL,
+						      "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)",
+						      (unsigned long long) block_number,
+						      entry_point.entry_count,
+						      entry.sbn,
+						      max_sbn);
+
+		result = replay_reference_count_change(slab, &entry_point, entry);
+		if (result != VDO_SUCCESS) {
+			uds_log_error_strerror(result,
+					       "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u",
+					       (unsigned long long) block_number,
+					       entry_point.entry_count,
+					       vdo_get_journal_operation_name(entry.operation),
+					       entry.sbn,
+					       slab->slab_number);
+			return result;
+		}
+		entry_point.entry_count++;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * apply_journal_entries() - Find the relevant portion of the slab journal and apply all valid
+ *                           entries.
+ * @completion: The metadata read vio completion.
+ *
+ * This is a callback registered in start_scrubbing().
+ */
+static void apply_journal_entries(struct vdo_completion *completion)
+{
+	int result;
+	struct slab_scrubber *scrubber
+		= container_of(as_vio(completion), struct slab_scrubber, vio);
+	struct vdo_slab *slab = scrubber->slab;
+	struct slab_journal *journal = &slab->journal;
+
+	/* Find the boundaries of the useful part of the journal. */
+	sequence_number_t tail = journal->tail;
+	tail_block_offset_t end_index = (tail - 1) % journal->size;
+	char *end_data = scrubber->vio.data + (end_index * VDO_BLOCK_SIZE);
+	struct packed_slab_journal_block *end_block =
+		(struct packed_slab_journal_block *) end_data;
+
+	sequence_number_t head = __le64_to_cpu(end_block->header.head);
+	tail_block_offset_t head_index = head % journal->size;
+	block_count_t index = head_index;
+
+	struct journal_point ref_counts_point = slab->slab_journal_point;
+	struct journal_point last_entry_applied = ref_counts_point;
+	sequence_number_t sequence;
+
+	for (sequence = head; sequence < tail; sequence++) {
+		char *block_data = scrubber->vio.data + (index * VDO_BLOCK_SIZE);
+		struct packed_slab_journal_block *block =
+			(struct packed_slab_journal_block *) block_data;
+		struct slab_journal_block_header header;
+
+		vdo_unpack_slab_journal_block_header(&block->header, &header);
+
+		if ((header.nonce != slab->allocator->nonce) ||
+		    (header.metadata_type != VDO_METADATA_SLAB_JOURNAL) ||
+		    (header.sequence_number != sequence) ||
+		    (header.entry_count > journal->entries_per_block) ||
+		    (header.has_block_map_increments &&
+		     (header.entry_count > journal->full_entries_per_block))) {
+			/* The block is not what we expect it to be. */
+			uds_log_error("vdo_slab journal block for slab %u was invalid",
+				      slab->slab_number);
+			abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL);
+			return;
+		}
+
+		result = apply_block_entries(block, header.entry_count, sequence, slab);
+		if (result != VDO_SUCCESS) {
+			abort_scrubbing(scrubber, result);
+			return;
+		}
+
+		last_entry_applied.sequence_number = sequence;
+		last_entry_applied.entry_count = header.entry_count - 1;
+		index++;
+		if (index == journal->size)
+			index = 0;
+	}
+
+	/*
+	 * At the end of rebuild, the reference counters should be accurate to the end of the
+	 * journal we just applied.
+	 */
+	result = ASSERT(!vdo_before_journal_point(&last_entry_applied, &ref_counts_point),
+			"Refcounts are not more accurate than the slab journal");
+	if (result != VDO_SUCCESS) {
+		abort_scrubbing(scrubber, result);
+		return;
+	}
+
+	/* Save out the rebuilt reference blocks. */
+	vdo_prepare_completion(completion,
+			       slab_scrubbed,
+			       handle_scrubber_error,
+			       slab->allocator->thread_id,
+			       completion->parent);
+	vdo_start_operation_with_waiter(&slab->state,
+					VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING,
+					completion,
+					initiate_slab_action);
+}
+
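+/* The bio endio for a slab journal read; continues on the slab's allocator thread. */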
+static void read_slab_journal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio);
+
+	continue_vio_after_io(vio,
+			      apply_journal_entries,
+			      scrubber->slab->allocator->thread_id);
+}
+
+/**
+ * start_scrubbing() - Read the current slab's journal from disk now that it has been flushed.
+ * @completion: The scrubber's vio completion.
+ *
+ * This callback is registered in scrub_next_slab().
+ */
+static void start_scrubbing(struct vdo_completion *completion)
+{
+	struct slab_scrubber *scrubber =
+		container_of(as_vio(completion), struct slab_scrubber, vio);
+	struct vdo_slab *slab = scrubber->slab;
+
+	if (!slab->allocator->summary_entries[slab->slab_number].is_dirty) {
+		slab_scrubbed(completion);
+		return;
+	}
+
+	submit_metadata_vio(&scrubber->vio,
+			    slab->journal_origin,
+			    read_slab_journal_endio,
+			    handle_scrubber_error,
+			    REQ_OP_READ);
+}
+
+/**
+ * scrub_next_slab() - Scrub the next slab if there is one.
+ * @scrubber: The scrubber.
+ */
+static void scrub_next_slab(struct slab_scrubber *scrubber)
+{
+	struct vdo_completion *completion = &scrubber->vio.completion;
+	struct vdo_slab *slab;
+
+	/*
+	 * Note: this notify call is safe only because scrubbing can only be started when
+	 * the VDO is quiescent.
+	 */
+	vdo_notify_all_waiters(&scrubber->waiters, NULL, NULL);
+
+	if (vdo_is_read_only(completion->vdo)) {
+		finish_scrubbing(scrubber, VDO_READ_ONLY);
+		return;
+	}
+
+	slab = get_next_slab(scrubber);
+	if ((slab == NULL) ||
+	    (scrubber->high_priority_only && list_empty(&scrubber->high_priority_slabs))) {
+		finish_scrubbing(scrubber, VDO_SUCCESS);
+		return;
+	}
+
+	if (vdo_finish_draining(&scrubber->admin_state))
+		return;
+
+	list_del_init(&slab->allocq_entry);
+	scrubber->slab = slab;
+	vdo_prepare_completion(completion,
+			       start_scrubbing,
+			       handle_scrubber_error,
+			       slab->allocator->thread_id,
+			       completion->parent);
+	vdo_start_operation_with_waiter(&slab->state,
+					VDO_ADMIN_STATE_SCRUBBING,
+					completion,
+					initiate_slab_action);
+}
+
+/**
+ * scrub_slabs() - Scrub all of an allocator's slabs that are eligible for scrubbing.
+ * @allocator: The block_allocator to scrub.
+ * @parent: The completion to notify when scrubbing is done, or NULL; a non-NULL parent implies
+ *          high_priority scrubbing.
+ */
+static void scrub_slabs(struct block_allocator *allocator, struct vdo_completion *parent)
+{
+	struct slab_scrubber *scrubber = &allocator->scrubber;
+
+	scrubber->vio.completion.parent = parent;
+	scrubber->high_priority_only = (parent != NULL);
+	if (!has_slabs_to_scrub(scrubber)) {
+		finish_scrubbing(scrubber, VDO_SUCCESS);
+		return;
+	}
+
+	if (scrubber->high_priority_only &&
+	    vdo_is_priority_table_empty(allocator->prioritized_slabs) &&
+	    list_empty(&scrubber->high_priority_slabs))
+		register_slab_for_scrubbing(get_next_slab(scrubber), true);
+
+	vdo_resume_if_quiescent(&scrubber->admin_state);
+	scrub_next_slab(scrubber);
+}
+
+static inline void assert_on_allocator_thread(thread_id_t thread_id, const char *function_name)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id),
+			"%s called on correct thread",
+			function_name);
+}
+
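+/* Register a slab with its allocator, updating the allocator's slab count and last slab. */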
+static void register_slab_with_allocator(struct block_allocator *allocator, struct vdo_slab *slab)
+{
+	allocator->slab_count++;
+	allocator->last_slab = slab->slab_number;
+}
+
+/**
+ * get_depot_slab_iterator() - Return a slab_iterator over the slabs in a slab_depot.
+ * @depot: The depot over which to iterate.
+ * @start: The number of the slab to start iterating from.
+ * @end: The number of the last slab which may be returned.
+ * @stride: The difference in slab number between successive slabs.
+ *
+ * Iteration always occurs from higher to lower numbered slabs.
+ *
+ * Return: An initialized iterator structure.
+ */
+static struct slab_iterator get_depot_slab_iterator(struct slab_depot *depot,
+						    slab_count_t start,
+						    slab_count_t end,
+						    slab_count_t stride)
+{
+	struct vdo_slab **slabs = depot->slabs;
+
+	return (struct slab_iterator) {
+		.slabs = slabs,
+		.next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]),
+		.end = end,
+		.stride = stride,
+	};
+}
+
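+/* Get an iterator over all of an allocator's slabs, from highest to lowest numbered. */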
+static struct slab_iterator get_slab_iterator(const struct block_allocator *allocator)
+{
+	return get_depot_slab_iterator(allocator->depot,
+				       allocator->last_slab,
+				       allocator->zone_number,
+				       allocator->depot->zone_count);
+}
+
+/**
+ * next_slab() - Get the next slab from a slab_iterator and advance the iterator.
+ * @iterator: The slab_iterator.
+ *
+ * Return: The next slab or NULL if the iterator is exhausted.
+ */
+static struct vdo_slab *next_slab(struct slab_iterator *iterator)
+{
+	struct vdo_slab *slab = iterator->next;
+
+	if ((slab == NULL) || (slab->slab_number < iterator->end + iterator->stride))
+		iterator->next = NULL;
+	else
+		iterator->next = iterator->slabs[slab->slab_number - iterator->stride];
+
+	return slab;
+}
+
+/**
+ * abort_waiter() - Abort vios waiting to make journal entries when read-only.
+ *
+ * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone
+ * into read-only mode. Implements waiter_callback.
+ */
+static void abort_waiter(struct waiter *waiter, void *context __always_unused)
+{
+	struct reference_updater *updater = container_of(waiter, struct reference_updater, waiter);
+	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
+
+	if (updater->increment) {
+		continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
+		return;
+	}
+
+	vdo_continue_completion(&data_vio->decrement_completion, VDO_READ_ONLY);
+}
+
+/* Implements vdo_read_only_notification. */
+static void notify_block_allocator_of_read_only_mode(void *listener, struct vdo_completion *parent)
+{
+	struct block_allocator *allocator = listener;
+	struct slab_iterator iterator;
+
+	assert_on_allocator_thread(allocator->thread_id, __func__);
+	iterator = get_slab_iterator(allocator);
+	while (iterator.next != NULL) {
+		struct vdo_slab *slab = next_slab(&iterator);
+
+		vdo_notify_all_waiters(&slab->journal.entry_waiters, abort_waiter, &slab->journal);
+		check_if_slab_drained(slab);
+	}
+
+	vdo_finish_completion(parent);
+}
+
+/**
+ * vdo_acquire_provisional_reference() - Acquire a provisional reference on behalf of a PBN lock if
+ *                                       the block it locks is unreferenced.
+ * @slab: The slab which contains the block.
+ * @pbn: The physical block to reference.
+ * @lock: The lock.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_acquire_provisional_reference(struct vdo_slab *slab,
+				      physical_block_number_t pbn,
+				      struct pbn_lock *lock)
+{
+	slab_block_number block_number;
+	int result;
+
+	if (vdo_pbn_lock_has_provisional_reference(lock))
+		return VDO_SUCCESS;
+
+	if (!is_slab_open(slab))
+		return VDO_INVALID_ADMIN_STATE;
+
+	result = slab_block_number_from_pbn(slab, pbn, &block_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (slab->counters[block_number] == EMPTY_REFERENCE_COUNT) {
+		make_provisional_reference(slab, block_number);
+		if (lock != NULL)
+			vdo_assign_pbn_lock_provisional_reference(lock);
+	}
+
+	if (vdo_pbn_lock_has_provisional_reference(lock))
+		adjust_free_block_count(slab, false);
+
+	return VDO_SUCCESS;
+}
+
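+/**
+ * allocate_slab_block() - Allocate the first unreferenced block found in an open slab, giving it
+ *                         a provisional reference.
+ * @slab: The slab to allocate from.
+ * @block_number_ptr: A pointer to receive the physical block number of the allocated block.
+ *
+ * Return: VDO_SUCCESS, VDO_NO_SPACE if the slab has no free blocks, or VDO_INVALID_ADMIN_STATE if
+ *         the slab is not open.
+ */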
+static int __must_check
+allocate_slab_block(struct vdo_slab *slab, physical_block_number_t *block_number_ptr)
+{
+	slab_block_number free_index;
+
+	if (!is_slab_open(slab))
+		return VDO_INVALID_ADMIN_STATE;
+
+	if (!search_reference_blocks(slab, &free_index))
+		return VDO_NO_SPACE;
+
+	ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT),
+			"free block must have ref count of zero");
+	make_provisional_reference(slab, free_index);
+	adjust_free_block_count(slab, false);
+
+	/*
+	 * Update the search hint so the next search will start at the array index just past the
+	 * free block we just found.
+	 */
+	slab->search_cursor.index = (free_index + 1);
+
+	*block_number_ptr = slab->start + free_index;
+	return VDO_SUCCESS;
+}
+
+/**
+ * open_slab() - Prepare a slab to be allocated from.
+ * @slab: The slab.
+ */
+static void open_slab(struct vdo_slab *slab)
+{
+	reset_search_cursor(slab);
+	if (is_slab_journal_blank(slab)) {
+		WRITE_ONCE(slab->allocator->statistics.slabs_opened,
+			   slab->allocator->statistics.slabs_opened + 1);
+		dirty_all_reference_blocks(slab);
+	} else {
+		WRITE_ONCE(slab->allocator->statistics.slabs_reopened,
+			   slab->allocator->statistics.slabs_reopened + 1);
+	}
+
+	slab->allocator->open_slab = slab;
+}
+
+/*
+ * The block allocated will have a provisional reference and the reference must be either confirmed
+ * with a subsequent increment or vacated with a subsequent decrement via
+ * vdo_release_block_reference().
+ */
+int vdo_allocate_block(struct block_allocator *allocator,
+		       physical_block_number_t *block_number_ptr)
+{
+	int result;
+
+	if (allocator->open_slab != NULL) {
+		/* Try to allocate the next block in the currently open slab. */
+		result = allocate_slab_block(allocator->open_slab, block_number_ptr);
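+		/* Succeeded, or failed for a reason other than this slab being out of space. */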
+		if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE))
+			return result;
+
+		/* Put the exhausted open slab back into the priority table. */
+		prioritize_slab(allocator->open_slab);
+	}
+
+	/* Remove the highest priority slab from the priority table and make it the open slab. */
+	open_slab(list_entry(vdo_priority_table_dequeue(allocator->prioritized_slabs),
+			     struct vdo_slab,
+			     allocq_entry));
+
+	/*
+	 * Try allocating again. If we're out of space immediately after opening a slab, then every
+	 * slab must be fully allocated.
+	 */
+	return allocate_slab_block(allocator->open_slab, block_number_ptr);
+}
+
+/**
+ * vdo_enqueue_clean_slab_waiter() - Wait for a clean slab.
+ * @allocator: The block_allocator on which to wait.
+ * @waiter: The waiter.
+ *
+ * Return: VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no slabs to scrub, and
+ *         some other error otherwise.
+ */
+int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator, struct waiter *waiter)
+{
+	if (vdo_is_read_only(allocator->depot->vdo))
+		return VDO_READ_ONLY;
+
+	if (vdo_is_state_quiescent(&allocator->scrubber.admin_state))
+		return VDO_NO_SPACE;
+
+	vdo_enqueue_waiter(&allocator->scrubber.waiters, waiter);
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_modify_reference_count() - Modify the reference count of a block by first making a slab
+ *                                journal entry and then updating the reference counter.
+ *
+ * @completion: The completion of the data_vio for which to add the entry.
+ * @updater: Which of the data_vio's reference updaters is being submitted.
+ */
+void vdo_modify_reference_count(struct vdo_completion *completion,
+				struct reference_updater *updater)
+{
+	struct vdo_slab *slab = vdo_get_slab(completion->vdo->depot, updater->zpbn.pbn);
+
+	if (!is_slab_open(slab)) {
+		vdo_continue_completion(completion, VDO_INVALID_ADMIN_STATE);
+		return;
+	}
+
+	if (vdo_is_read_only(completion->vdo)) {
+		vdo_continue_completion(completion, VDO_READ_ONLY);
+		return;
+	}
+
+	vdo_enqueue_waiter(&slab->journal.entry_waiters, &updater->waiter);
+	if ((slab->status != VDO_SLAB_REBUILT) && requires_reaping(&slab->journal))
+		register_slab_for_scrubbing(slab, true);
+
+	add_entries(&slab->journal);
+}
+
+/* Release an unused provisional reference. */
+int vdo_release_block_reference(struct block_allocator *allocator, physical_block_number_t pbn)
+{
+	struct reference_updater updater;
+
+	if (pbn == VDO_ZERO_BLOCK)
+		return VDO_SUCCESS;
+
+	updater = (struct reference_updater) {
+		.operation = VDO_JOURNAL_DATA_REMAPPING,
+		.increment = false,
+		.zpbn = {
+			.pbn = pbn,
+		},
+	};
+
+	return adjust_reference_count(vdo_get_slab(allocator->depot, pbn), &updater, NULL);
+}
+
+/**
+ * This is a min_heap callback function that orders slab_status structures using the 'is_clean'
+ * field as the primary key and the 'emptiness' field as the secondary key.
+ *
+ * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping
+ * should always get the most empty first, so pushing should be from most empty to least empty.
+ * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
+ * before larger ones.
+ */
+static bool slab_status_is_less_than(const void *item1, const void *item2)
+{
+	const struct slab_status *info1 = (const struct slab_status *) item1;
+	const struct slab_status *info2 = (const struct slab_status *) item2;
+
+	if (info1->is_clean != info2->is_clean)
+		return info1->is_clean;
+	if (info1->emptiness != info2->emptiness)
+		return info1->emptiness > info2->emptiness;
+	return info1->slab_number < info2->slab_number;
+}
+
+static void swap_slab_statuses(void *item1, void *item2)
+{
+	struct slab_status *info1 = item1;
+	struct slab_status *info2 = item2;
+
+	swap(*info1, *info2);
+}
+
+static const struct min_heap_callbacks slab_status_min_heap = {
+	.elem_size = sizeof(struct slab_status),
+	.less = slab_status_is_less_than,
+	.swp = swap_slab_statuses,
+};
+
+/* Inform the slab actor that an action has finished on some slab; used by apply_to_slabs(). */
+static void slab_action_callback(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+	struct slab_actor *actor = &allocator->slab_actor;
+
+	if (--actor->slab_action_count == 0) {
+		actor->callback(completion);
+		return;
+	}
+
+	vdo_reset_completion(completion);
+}
+
+/* Preserve the error from part of an action and continue. */
+static void handle_operation_error(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+
+	if (allocator->state.waiter != NULL)
+		vdo_set_completion_result(allocator->state.waiter, completion->result);
+	completion->callback(completion);
+}
+
+/* Perform an action on each of an allocator's slabs in parallel. */
+static void apply_to_slabs(struct block_allocator *allocator, vdo_action *callback)
+{
+	struct slab_iterator iterator;
+
+	vdo_prepare_completion(&allocator->completion,
+			       slab_action_callback,
+			       handle_operation_error,
+			       allocator->thread_id,
+			       NULL);
+	allocator->completion.requeue = false;
+
+	/*
+	 * Since we are going to dequeue all of the slabs, the open slab will become invalid, so
+	 * clear it.
+	 */
+	allocator->open_slab = NULL;
+
+	/* Ensure that we don't finish before we're done starting. */
+	allocator->slab_actor = (struct slab_actor) {
+		.slab_action_count = 1,
+		.callback = callback,
+	};
+
+	iterator = get_slab_iterator(allocator);
+	while (iterator.next != NULL) {
+		const struct admin_state_code *operation =
+			vdo_get_admin_state_code(&allocator->state);
+		struct vdo_slab *slab = next_slab(&iterator);
+
+		list_del_init(&slab->allocq_entry);
+		allocator->slab_actor.slab_action_count++;
+		vdo_start_operation_with_waiter(&slab->state,
+						operation,
+						&allocator->completion,
+						initiate_slab_action);
+	}
+
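+	/*
+	 * Release the count taken above; the callback fires only after this and every slab
+	 * operation started in the loop have all finished.
+	 */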
+	slab_action_callback(&allocator->completion);
+}
+
+static void finish_loading_allocator(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+	const struct admin_state_code *operation = vdo_get_admin_state_code(&allocator->state);
+
+	if (allocator->eraser != NULL)
+		dm_kcopyd_client_destroy(UDS_FORGET(allocator->eraser));
+
+	if (operation == VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) {
+		void *context = vdo_get_current_action_context(allocator->depot->action_manager);
+
+		vdo_replay_into_slab_journals(allocator, context);
+		return;
+	}
+
+	vdo_finish_loading(&allocator->state);
+}
+
+static void erase_next_slab_journal(struct block_allocator *allocator);
+
+static void copy_callback(int read_err, unsigned long write_err, void *context)
+{
+	struct block_allocator *allocator = context;
+	int result = (((read_err == 0) && (write_err == 0)) ? VDO_SUCCESS : -EIO);
+
+	if (result != VDO_SUCCESS) {
+		vdo_fail_completion(&allocator->completion, result);
+		return;
+	}
+
+	erase_next_slab_journal(allocator);
+}
+
+/* erase_next_slab_journal() - Erase the next slab journal. */
+static void erase_next_slab_journal(struct block_allocator *allocator)
+{
+	struct vdo_slab *slab;
+	physical_block_number_t pbn;
+	struct dm_io_region regions[1];
+	struct slab_depot *depot = allocator->depot;
+	block_count_t blocks = depot->slab_config.slab_journal_blocks;
+
+	if (allocator->slabs_to_erase.next == NULL) {
+		vdo_finish_completion(&allocator->completion);
+		return;
+	}
+
+	slab = next_slab(&allocator->slabs_to_erase);
+	pbn = slab->journal_origin - depot->vdo->geometry.bio_offset;
+	regions[0] = (struct dm_io_region) {
+		.bdev = vdo_get_backing_device(depot->vdo),
+		.sector = pbn * VDO_SECTORS_PER_BLOCK,
+		.count = blocks * VDO_SECTORS_PER_BLOCK,
+	};
+	dm_kcopyd_zero(allocator->eraser, 1, regions, 0, copy_callback, allocator);
+}
+
+/* Implements vdo_admin_initiator. */
+static void initiate_load(struct admin_state *state)
+{
+	struct block_allocator *allocator = container_of(state, struct block_allocator, state);
+	const struct admin_state_code *operation = vdo_get_admin_state_code(state);
+
+	if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) {
+		/*
+		 * Must requeue because the kcopyd client cannot be freed in the same stack frame
+		 * as the kcopyd callback, lest it deadlock.
+		 */
+		vdo_prepare_completion_for_requeue(&allocator->completion,
+						   finish_loading_allocator,
+						   handle_operation_error,
+						   allocator->thread_id,
+						   NULL);
+		allocator->eraser = dm_kcopyd_client_create(NULL);
+		if (allocator->eraser == NULL) {
+			vdo_fail_completion(&allocator->completion, -ENOMEM);
+			return;
+		}
+		allocator->slabs_to_erase = get_slab_iterator(allocator);
+
+		erase_next_slab_journal(allocator);
+		return;
+	}
+
+	apply_to_slabs(allocator, finish_loading_allocator);
+}
+
+/**
+ * vdo_notify_slab_journals_are_recovered() - Inform a block allocator that its slab journals have
+ *                                            been recovered from the recovery journal.
+ * @completion: The allocator completion.
+ */
+void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+
+	vdo_finish_loading_with_result(&allocator->state, completion->result);
+}
+
+static int
+get_slab_statuses(struct block_allocator *allocator, struct slab_status **statuses_ptr)
+{
+	int result;
+	struct slab_status *statuses;
+	struct slab_iterator iterator = get_slab_iterator(allocator);
+
+	result = UDS_ALLOCATE(allocator->slab_count, struct slab_status, __func__, &statuses);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*statuses_ptr = statuses;
+
+	while (iterator.next != NULL)  {
+		slab_count_t slab_number = next_slab(&iterator)->slab_number;
+
+		*statuses++ = (struct slab_status) {
+			.slab_number = slab_number,
+			.is_clean = !allocator->summary_entries[slab_number].is_dirty,
+			.emptiness = allocator->summary_entries[slab_number].fullness_hint,
+		};
+	}
+
+	return VDO_SUCCESS;
+}
+
+/* Prepare slabs for allocation or scrubbing. */
+static int __must_check
+vdo_prepare_slabs_for_allocation(struct block_allocator *allocator)
+{
+	struct slab_status current_slab_status;
+	struct min_heap heap;
+	int result;
+	struct slab_status *slab_statuses;
+	struct slab_depot *depot = allocator->depot;
+
+	WRITE_ONCE(allocator->allocated_blocks,
+		   allocator->slab_count * depot->slab_config.data_blocks);
+	result = get_slab_statuses(allocator, &slab_statuses);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Sort the slabs by cleanliness, then by emptiness hint. */
+	heap = (struct min_heap) {
+		.data = slab_statuses,
+		.nr = allocator->slab_count,
+		.size = allocator->slab_count,
+	};
+	min_heapify_all(&heap, &slab_status_min_heap);
+
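+	/* Pop slabs off the heap cleanest-and-emptiest first (see slab_status_is_less_than()). */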
+	while (heap.nr > 0) {
+		bool high_priority;
+		struct vdo_slab *slab;
+		struct slab_journal *journal;
+
+		current_slab_status = slab_statuses[0];
+		min_heap_pop(&heap, &slab_status_min_heap);
+		slab = depot->slabs[current_slab_status.slab_number];
+
+		if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) ||
+		    (!allocator->summary_entries[slab->slab_number].load_ref_counts &&
+		     current_slab_status.is_clean)) {
+			queue_slab(slab);
+			continue;
+		}
+
+		slab->status = VDO_SLAB_REQUIRES_SCRUBBING;
+		journal = &slab->journal;
+		high_priority = ((current_slab_status.is_clean &&
+				 (depot->load_type == VDO_SLAB_DEPOT_NORMAL_LOAD)) ||
+				 (journal_length(journal) >= journal->scrubbing_threshold));
+		register_slab_for_scrubbing(slab, high_priority);
+	}
+
+	UDS_FREE(slab_statuses);
+	return VDO_SUCCESS;
+}
+
+static const char *status_to_string(enum slab_rebuild_status status)
+{
+	switch (status) {
+	case VDO_SLAB_REBUILT:
+		return "REBUILT";
+	case VDO_SLAB_REQUIRES_SCRUBBING:
+		return "SCRUBBING";
+	case VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING:
+		return "PRIORITY_SCRUBBING";
+	case VDO_SLAB_REBUILDING:
+		return "REBUILDING";
+	case VDO_SLAB_REPLAYING:
+		return "REPLAYING";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+void vdo_dump_block_allocator(const struct block_allocator *allocator)
+{
+	unsigned int pause_counter = 0;
+	struct slab_iterator iterator = get_slab_iterator(allocator);
+	const struct slab_scrubber *scrubber = &allocator->scrubber;
+
+	uds_log_info("block_allocator zone %u", allocator->zone_number);
+	while (iterator.next != NULL) {
+		struct vdo_slab *slab = next_slab(&iterator);
+		struct slab_journal *journal = &slab->journal;
+
+		if (slab->reference_blocks != NULL)
+			/* Terse because there are a lot of slabs to dump and syslog is lossy. */
+			uds_log_info("slab %u: P%u, %llu free",
+				     slab->slab_number,
+				     slab->priority,
+				     (unsigned long long) slab->free_blocks);
+		else
+			uds_log_info("slab %u: status %s",
+				     slab->slab_number,
+				     status_to_string(slab->status));
+
+		uds_log_info("  slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s",
+			     vdo_count_waiters(&journal->entry_waiters),
+			     uds_bool_to_string(journal->waiting_to_commit),
+			     uds_bool_to_string(journal->updating_slab_summary),
+			     (unsigned long long) journal->head,
+			     (unsigned long long) journal->unreapable,
+			     (unsigned long long) journal->tail,
+			     (unsigned long long) journal->next_commit,
+			     (unsigned long long) journal->summarized,
+			     (unsigned long long) journal->last_summarized,
+			     (unsigned long long) journal->recovery_lock,
+			     uds_bool_to_string(journal->recovery_lock != 0));
+		/*
+		 * Given the frequency with which the locks are just a tiny bit off, it might be
+		 * worth dumping all the locks, but that might be too much logging.
+		 */
+
+		if (slab->counters != NULL)
+			/* Terse because there are a lot of slabs to dump and syslog is lossy. */
+			uds_log_info("  slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)",
+				     slab->free_blocks,
+				     slab->block_count,
+				     slab->reference_block_count,
+				     vdo_count_waiters(&slab->dirty_blocks),
+				     slab->active_count,
+				     (unsigned long long) slab->slab_journal_point.sequence_number,
+				     slab->slab_journal_point.entry_count);
+		else
+			uds_log_info("  no counters");
+
+		/*
+		 * Wait briefly after each batch of 32 slabs dumped (an arbitrary number) to give
+		 * the kernel log a chance to be flushed instead of being overrun.
+		 */
+		if (pause_counter++ == 31) {
+			pause_counter = 0;
+			uds_pause_for_logger();
+		}
+	}
+
+	uds_log_info("slab_scrubber slab_count %u waiters %zu %s%s",
+		     READ_ONCE(scrubber->slab_count),
+		     vdo_count_waiters(&scrubber->waiters),
+		     vdo_get_admin_state_code(&scrubber->admin_state)->name,
+		     scrubber->high_priority_only ? ", high_priority_only " : "");
+}
+
+static void free_slab(struct vdo_slab *slab)
+{
+	if (slab == NULL)
+		return;
+
+	list_del(&slab->allocq_entry);
+	UDS_FREE(UDS_FORGET(slab->journal.block));
+	UDS_FREE(UDS_FORGET(slab->journal.locks));
+	UDS_FREE(UDS_FORGET(slab->counters));
+	UDS_FREE(UDS_FORGET(slab->reference_blocks));
+	UDS_FREE(slab);
+}
+
+static int initialize_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	const struct slab_config *slab_config = &slab->allocator->depot->slab_config;
+	int result;
+
+	result = UDS_ALLOCATE(slab_config->slab_journal_blocks,
+			      struct journal_lock,
+			      __func__,
+			      &journal->locks);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(VDO_BLOCK_SIZE,
+			      char,
+			      "struct packed_slab_journal_block",
+			      (char **) &journal->block);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	journal->slab = slab;
+	journal->size = slab_config->slab_journal_blocks;
+	journal->flushing_threshold = slab_config->slab_journal_flushing_threshold;
+	journal->blocking_threshold = slab_config->slab_journal_blocking_threshold;
+	journal->scrubbing_threshold = slab_config->slab_journal_scrubbing_threshold;
+	journal->entries_per_block = VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK;
+	journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK;
+	journal->events = &slab->allocator->slab_journal_statistics;
+	journal->recovery_journal = slab->allocator->depot->vdo->recovery_journal;
+	journal->tail = 1;
+	journal->head = 1;
+
+	journal->flushing_deadline = journal->flushing_threshold;
+	/*
+	 * Leave some time between the flushing deadline and the blocking threshold, so that
+	 * hopefully all the flushing is done before entries start blocking.
+	 */
+	if ((journal->blocking_threshold - journal->flushing_threshold) > 5)
+		journal->flushing_deadline = journal->blocking_threshold - 5;
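+	/*
+	 * For example (numbers purely illustrative): with flushing_threshold = 60 and
+	 * blocking_threshold = 100, the deadline becomes 95; if the two thresholds are within 5
+	 * blocks of each other, the deadline stays at the flushing threshold.
+	 */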
+
+	journal->slab_summary_waiter.callback = release_journal_locks;
+
+	INIT_LIST_HEAD(&journal->dirty_entry);
+	INIT_LIST_HEAD(&journal->uncommitted_blocks);
+
+	journal->tail_header.nonce = slab->allocator->nonce;
+	journal->tail_header.metadata_type = VDO_METADATA_SLAB_JOURNAL;
+	initialize_journal_state(journal);
+	return VDO_SUCCESS;
+}
+
+/**
+ * make_slab() - Construct a new, empty slab.
+ * @slab_origin: The physical block number within the block allocator partition of the first block
+ *               in the slab.
+ * @allocator: The block allocator to which the slab belongs.
+ * @slab_number: The slab number of the slab.
+ * @is_new: true if this slab is being allocated as part of a resize.
+ * @slab_ptr: A pointer to receive the new slab.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check
+make_slab(physical_block_number_t slab_origin,
+	  struct block_allocator *allocator,
+	  slab_count_t slab_number,
+	  bool is_new,
+	  struct vdo_slab **slab_ptr)
+{
+	const struct slab_config *slab_config = &allocator->depot->slab_config;
+	struct vdo_slab *slab;
+	int result;
+
+	result = UDS_ALLOCATE(1, struct vdo_slab, __func__, &slab);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*slab = (struct vdo_slab) {
+		.allocator = allocator,
+		.start = slab_origin,
+		.end = slab_origin + slab_config->slab_blocks,
+		.slab_number = slab_number,
+		.ref_counts_origin = slab_origin + slab_config->data_blocks,
+		.journal_origin = vdo_get_slab_journal_start_block(slab_config, slab_origin),
+		.block_count = slab_config->data_blocks,
+		.free_blocks = slab_config->data_blocks,
+		.reference_block_count =
+			vdo_get_saved_reference_count_size(slab_config->data_blocks),
+	};
+	INIT_LIST_HEAD(&slab->allocq_entry);
+
+	result = initialize_slab_journal(slab);
+	if (result != VDO_SUCCESS) {
+		free_slab(slab);
+		return result;
+	}
+
+	if (is_new) {
+		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW);
+		result = allocate_slab_counters(slab);
+		if (result != VDO_SUCCESS) {
+			free_slab(slab);
+			return result;
+		}
+	} else {
+		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	}
+
+	*slab_ptr = slab;
+	return VDO_SUCCESS;
+}
+
+/**
+ * allocate_slabs() - Allocate a new slab pointer array.
+ * @depot: The depot.
+ * @slab_count: The number of slabs the depot should have in the new array.
+ *
+ * Any existing slab pointers will be copied into the new array, and slabs will be allocated as
+ * needed. The newly allocated slabs will not be distributed for use by the block allocators.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count)
+{
+	block_count_t slab_size;
+	bool resizing = false;
+	physical_block_number_t slab_origin;
+	int result;
+
+	result = UDS_ALLOCATE(slab_count,
+			      struct vdo_slab *,
+			      "slab pointer array",
+			      &depot->new_slabs);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (depot->slabs != NULL) {
+		memcpy(depot->new_slabs,
+		       depot->slabs,
+		       depot->slab_count * sizeof(struct vdo_slab *));
+		resizing = true;
+	}
+
+	slab_size = depot->slab_config.slab_blocks;
+	slab_origin = depot->first_block + (depot->slab_count * slab_size);
+
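+	/* New slabs are assigned to the allocators round-robin by slab number. */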
+	for (depot->new_slab_count = depot->slab_count;
+	     depot->new_slab_count < slab_count;
+	     depot->new_slab_count++, slab_origin += slab_size) {
+		struct block_allocator *allocator =
+			&depot->allocators[depot->new_slab_count % depot->zone_count];
+		struct vdo_slab **slab_ptr = &depot->new_slabs[depot->new_slab_count];
+
+		result = make_slab(slab_origin,
+				   allocator,
+				   depot->new_slab_count,
+				   resizing,
+				   slab_ptr);
+		if (result != VDO_SUCCESS)
+			return result;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_abandon_new_slabs() - Abandon any new slabs in this depot, freeing them as needed.
+ * @depot: The depot.
+ */
+void vdo_abandon_new_slabs(struct slab_depot *depot)
+{
+	slab_count_t i;
+
+	if (depot->new_slabs == NULL)
+		return;
+
+	for (i = depot->slab_count; i < depot->new_slab_count; i++)
+		free_slab(UDS_FORGET(depot->new_slabs[i]));
+	depot->new_slab_count = 0;
+	depot->new_size = 0;
+	UDS_FREE(UDS_FORGET(depot->new_slabs));
+}
+
+/**
+ * get_allocator_thread_id() - Get the ID of the thread on which a given allocator operates.
+ *
+ * Implements vdo_zone_thread_getter.
+ */
+static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number)
+{
+	return ((struct slab_depot *) context)->allocators[zone_number].thread_id;
+}
+
+/**
+ * release_recovery_journal_lock() - Request the slab journal to release the recovery journal lock
+ *                                   it may hold on a specified recovery journal block.
+ * @journal: The slab journal.
+ * @recovery_lock: The sequence number of the recovery journal block whose locks should be
+ *                 released.
+ *
+ * Return: true if the journal does hold a lock on the specified block (which it will release).
+ */
+static bool __must_check
+release_recovery_journal_lock(struct slab_journal *journal, sequence_number_t recovery_lock)
+{
+	if (recovery_lock > journal->recovery_lock) {
+		ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock),
+				"slab journal recovery lock is not older than the recovery journal head");
+		return false;
+	}
+
+	if ((recovery_lock < journal->recovery_lock) ||
+	    vdo_is_read_only(journal->slab->allocator->depot->vdo))
+		return false;
+
+	/* All locks are held by the block which is in progress; write it. */
+	commit_tail(journal);
+	return true;
+}
+
+/*
+ * Request a commit of all dirty tail blocks which are locking the recovery journal block the depot
+ * is seeking to release.
+ *
+ * Implements vdo_zone_action.
+ */
+static void release_tail_block_locks(void *context,
+				     zone_count_t zone_number,
+				     struct vdo_completion *parent)
+{
+	struct slab_journal *journal, *tmp;
+	struct slab_depot *depot = context;
+	struct list_head *list = &depot->allocators[zone_number].dirty_slab_journals;
+
+	list_for_each_entry_safe(journal, tmp, list, dirty_entry) {
+		if (!release_recovery_journal_lock(journal, depot->active_release_request))
+			break;
+	}
+
+	vdo_finish_completion(parent);
+}
+
+/**
+ * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks.
+ *
+ * Implements vdo_action_preamble.
+ */
+static void prepare_for_tail_block_commit(void *context, struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+
+	depot->active_release_request = depot->new_release_request;
+	vdo_finish_completion(parent);
+}
+
+/**
+ * schedule_tail_block_commit() - Schedule a tail block commit if necessary.
+ *
+ * This method should not be called directly. Rather, call vdo_schedule_default_action() on the
+ * depot's action manager.
+ *
+ * Implements vdo_action_scheduler.
+ */
+static bool schedule_tail_block_commit(void *context)
+{
+	struct slab_depot *depot = context;
+
+	if (depot->new_release_request == depot->active_release_request)
+		return false;
+
+	return vdo_schedule_action(depot->action_manager,
+				   prepare_for_tail_block_commit,
+				   release_tail_block_locks,
+				   NULL,
+				   NULL);
+}
+
+/**
+ * initialize_slab_scrubber() - Initialize an allocator's slab scrubber.
+ * @allocator: The allocator being initialized
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int initialize_slab_scrubber(struct block_allocator *allocator)
+{
+	struct slab_scrubber *scrubber = &allocator->scrubber;
+	block_count_t slab_journal_size = allocator->depot->slab_config.slab_journal_blocks;
+	char *journal_data;
+	int result;
+
+	result = UDS_ALLOCATE(VDO_BLOCK_SIZE * slab_journal_size, char, __func__, &journal_data);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = allocate_vio_components(allocator->completion.vdo,
+					 VIO_TYPE_SLAB_JOURNAL,
+					 VIO_PRIORITY_METADATA,
+					 allocator,
+					 slab_journal_size,
+					 journal_data,
+					 &scrubber->vio);
+	if (result != VDO_SUCCESS) {
+		UDS_FREE(journal_data);
+		return result;
+	}
+
+	INIT_LIST_HEAD(&scrubber->high_priority_slabs);
+	INIT_LIST_HEAD(&scrubber->slabs);
+	vdo_set_admin_state_code(&scrubber->admin_state, VDO_ADMIN_STATE_SUSPENDED);
+	return VDO_SUCCESS;
+}
+
+/**
+ * initialize_slab_summary_block() - Initialize a slab_summary_block.
+ * @allocator: The allocator which owns the block.
+ * @index: The index of this block in its zone's summary.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check
+initialize_slab_summary_block(struct block_allocator *allocator, block_count_t index)
+{
+	struct slab_summary_block *block = &allocator->summary_blocks[index];
+	int result;
+
+	result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = allocate_vio_components(allocator->depot->vdo,
+					 VIO_TYPE_SLAB_SUMMARY,
+					 VIO_PRIORITY_METADATA,
+					 NULL,
+					 1,
+					 block->outgoing_entries,
+					 &block->vio);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	block->allocator = allocator;
+	block->entries = &allocator->summary_entries[VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK * index];
+	block->index = index;
+	return VDO_SUCCESS;
+}
+
+static int __must_check initialize_block_allocator(struct slab_depot *depot, zone_count_t zone)
+{
+	int result;
+	block_count_t i;
+	struct block_allocator *allocator = &depot->allocators[zone];
+	struct vdo *vdo = depot->vdo;
+	block_count_t max_free_blocks = depot->slab_config.data_blocks;
+	unsigned int max_priority = (2 + ilog2(max_free_blocks));
+
+	*allocator = (struct block_allocator) {
+		.depot = depot,
+		.zone_number = zone,
+		.thread_id = vdo->thread_config.physical_threads[zone],
+		.nonce = vdo->states.vdo.nonce,
+	};
+
+	INIT_LIST_HEAD(&allocator->dirty_slab_journals);
+	vdo_set_admin_state_code(&allocator->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	result = vdo_register_read_only_listener(vdo,
+						 allocator,
+						 notify_block_allocator_of_read_only_mode,
+						 allocator->thread_id);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
+	result = make_vio_pool(vdo,
+			       BLOCK_ALLOCATOR_VIO_POOL_SIZE,
+			       allocator->thread_id,
+			       VIO_TYPE_SLAB_JOURNAL,
+			       VIO_PRIORITY_METADATA,
+			       allocator,
+			       &allocator->vio_pool);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = initialize_slab_scrubber(allocator);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = vdo_make_priority_table(max_priority, &allocator->prioritized_slabs);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE,
+			      struct slab_summary_block,
+			      __func__,
+			      &allocator->summary_blocks);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	vdo_set_admin_state_code(&allocator->summary_state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	allocator->summary_entries = depot->summary_entries + (MAX_VDO_SLABS * zone);
+
+	/* Initialize each summary block. */
+	for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
+		result = initialize_slab_summary_block(allocator, i);
+		if (result != VDO_SUCCESS)
+			return result;
+	}
+
+	/*
+	 * Performing well atop thin provisioned storage requires either that VDO discards freed
+	 * blocks, or that the block allocator try to use slabs that already have allocated blocks
+	 * in preference to slabs that have never been opened. For reasons we have not been able to
+	 * fully understand, some SSD machines have been very sensitive (50% reduction in
+	 * test throughput) to very slight differences in the timing and locality of block
+	 * allocation. Assigning a low priority to unopened slabs (max_priority/2, say) would be
+	 * ideal for the story, but anything less than a very high threshold (max_priority - 1)
+	 * hurts on these machines.
+	 *
+	 * This sets the free block threshold for preferring to open an unopened slab to the binary
+	 * floor of 3/4ths the total number of data blocks in a slab, which will generally evaluate
+	 * to about half the slab size.
+	 */
+	allocator->unopened_slab_priority = (1 + ilog2((max_free_blocks * 3) / 4));
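+	/*
+	 * For example (numbers purely illustrative): with 32768 data blocks per slab,
+	 * (32768 * 3) / 4 = 24576 and ilog2(24576) = 14, so the priority is 15 and the binary
+	 * floor is 2^14 = 16384 free blocks, about half the slab.
+	 */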
+
+	return VDO_SUCCESS;
+}
+
+static int allocate_components(struct slab_depot *depot,
+			       struct partition *summary_partition)
+{
+	int result;
+	zone_count_t zone;
+	slab_count_t slab_count;
+	u8 hint;
+	u32 i;
+	const struct thread_config *thread_config = &depot->vdo->thread_config;
+
+	result = vdo_make_action_manager(depot->zone_count,
+					 get_allocator_thread_id,
+					 thread_config->journal_thread,
+					 depot,
+					 schedule_tail_block_commit,
+					 depot->vdo,
+					 &depot->action_manager);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	depot->origin = depot->first_block;
+
+	/* block size must be a multiple of entry size */
+	STATIC_ASSERT((VDO_BLOCK_SIZE % sizeof(struct slab_summary_entry)) == 0);
+
+	depot->summary_origin = summary_partition->offset;
+	depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift);
+	result = UDS_ALLOCATE(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES,
+			      struct slab_summary_entry,
+			      __func__,
+			      &depot->summary_entries);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Initialize all the entries. */
+	hint = compute_fullness_hint(depot, depot->slab_config.data_blocks);
+	for (i = 0; i < MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES; i++) {
+		/*
+		 * This default tail block offset must be reflected in
+		 * read_slab_journal_tail().
+		 */
+		depot->summary_entries[i] = (struct slab_summary_entry) {
+			.tail_block_offset = 0,
+			.fullness_hint = hint,
+			.load_ref_counts = false,
+			.is_dirty = false,
+		};
+	}
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	slab_count = vdo_compute_slab_count(depot->first_block,
+					    depot->last_block,
+					    depot->slab_size_shift);
+	if (thread_config->physical_zone_count > slab_count)
+		return uds_log_error_strerror(VDO_BAD_CONFIGURATION,
+					      "%u physical zones exceeds slab count %u",
+					      thread_config->physical_zone_count,
+					      slab_count);
+
+	/* Initialize the block allocators. */
+	for (zone = 0; zone < depot->zone_count; zone++) {
+		result = initialize_block_allocator(depot, zone);
+		if (result != VDO_SUCCESS)
+			return result;
+	}
+
+	/* Allocate slabs. */
+	result = allocate_slabs(depot, slab_count);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/* Use the new slabs. */
+	for (i = depot->slab_count; i < depot->new_slab_count; i++) {
+		struct vdo_slab *slab = depot->new_slabs[i];
+
+		register_slab_with_allocator(slab->allocator, slab);
+		WRITE_ONCE(depot->slab_count, depot->slab_count + 1);
+	}
+
+	depot->slabs = depot->new_slabs;
+	depot->new_slabs = NULL;
+	depot->new_slab_count = 0;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_decode_slab_depot() - Make a slab depot and configure it with the state read from the super
+ *                           block.
+ * @state: The slab depot state from the super block.
+ * @vdo: The VDO which will own the depot.
+ * @summary_partition: The partition which holds the slab summary.
+ * @depot_ptr: A pointer to hold the depot.
+ *
+ * Return: A success or error code.
+ */
+int vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
+			  struct vdo *vdo,
+			  struct partition *summary_partition,
+			  struct slab_depot **depot_ptr)
+{
+	unsigned int slab_size_shift;
+	struct slab_depot *depot;
+	int result;
+
+	/*
+	 * Calculate the bit shift for efficiently mapping block numbers to slabs. Using a shift
+	 * requires that the slab size be a power of two.
+	 */
+	block_count_t slab_size = state.slab_config.slab_blocks;
+
+	if (!is_power_of_2(slab_size))
+		return uds_log_error_strerror(UDS_INVALID_ARGUMENT,
+					      "slab size must be a power of two");
+	slab_size_shift = ilog2(slab_size);
+
+	result = UDS_ALLOCATE_EXTENDED(struct slab_depot,
+				       vdo->thread_config.physical_zone_count,
+				       struct block_allocator,
+				       __func__,
+				       &depot);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	depot->vdo = vdo;
+	depot->old_zone_count = state.zone_count;
+	depot->zone_count = vdo->thread_config.physical_zone_count;
+	depot->slab_config = state.slab_config;
+	depot->first_block = state.first_block;
+	depot->last_block = state.last_block;
+	depot->slab_size_shift = slab_size_shift;
+
+	result = allocate_components(depot, summary_partition);
+	if (result != VDO_SUCCESS) {
+		vdo_free_slab_depot(depot);
+		return result;
+	}
+
+	*depot_ptr = depot;
+	return VDO_SUCCESS;
+}
+
+static void uninitialize_allocator_summary(struct block_allocator *allocator)
+{
+	block_count_t i;
+
+	if (allocator->summary_blocks == NULL)
+		return;
+
+	for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
+		free_vio_components(&allocator->summary_blocks[i].vio);
+		UDS_FREE(UDS_FORGET(allocator->summary_blocks[i].outgoing_entries));
+	}
+
+	UDS_FREE(UDS_FORGET(allocator->summary_blocks));
+}
+
+/**
+ * vdo_free_slab_depot() - Destroy a slab depot.
+ * @depot: The depot to destroy.
+ */
+void vdo_free_slab_depot(struct slab_depot *depot)
+{
+	zone_count_t zone = 0;
+
+	if (depot == NULL)
+		return;
+
+	vdo_abandon_new_slabs(depot);
+
+	for (zone = 0; zone < depot->zone_count; zone++) {
+		struct block_allocator *allocator = &depot->allocators[zone];
+
+		if (allocator->eraser != NULL)
+			dm_kcopyd_client_destroy(UDS_FORGET(allocator->eraser));
+
+		uninitialize_allocator_summary(allocator);
+		uninitialize_scrubber_vio(&allocator->scrubber);
+		free_vio_pool(UDS_FORGET(allocator->vio_pool));
+		vdo_free_priority_table(UDS_FORGET(allocator->prioritized_slabs));
+	}
+
+	if (depot->slabs != NULL) {
+		slab_count_t i;
+
+		for (i = 0; i < depot->slab_count; i++)
+			free_slab(UDS_FORGET(depot->slabs[i]));
+	}
+
+	UDS_FREE(UDS_FORGET(depot->slabs));
+	UDS_FREE(UDS_FORGET(depot->action_manager));
+	UDS_FREE(UDS_FORGET(depot->summary_entries));
+	UDS_FREE(depot);
+}
+
+/**
+ * vdo_record_slab_depot() - Record the state of a slab depot for encoding into the super block.
+ * @depot: The depot to encode.
+ *
+ * Return: The depot state.
+ */
+struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot)
+{
+	/*
+	 * If this depot is currently using 0 zones, it must have been synchronously loaded by a
+	 * tool and is now being saved. We did not load and combine the slab summary, so we still
+	 * need to do that next time we load with the old zone count rather than 0.
+	 */
+	struct slab_depot_state_2_0 state;
+	zone_count_t zones_to_record = depot->zone_count;
+
+	if (depot->zone_count == 0)
+		zones_to_record = depot->old_zone_count;
+
+	state = (struct slab_depot_state_2_0) {
+		.slab_config = depot->slab_config,
+		.first_block = depot->first_block,
+		.last_block = depot->last_block,
+		.zone_count = zones_to_record,
+	};
+
+	return state;
+}
+
+/**
+ * vdo_allocate_reference_counters() - Allocate the reference counters for all slabs in the depot.
+ * @depot: The slab depot.
+ *
+ * Context: This method may be called only before entering normal operation from the load thread.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_allocate_reference_counters(struct slab_depot *depot)
+{
+	struct slab_iterator iterator =
+		get_depot_slab_iterator(depot, depot->slab_count - 1, 0, 1);
+
+	while (iterator.next != NULL) {
+		int result = allocate_slab_counters(next_slab(&iterator));
+
+		if (result != VDO_SUCCESS)
+			return result;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * get_slab_number() - Get the number of the slab that contains a specified block.
+ * @depot: The slab depot.
+ * @pbn: The physical block number.
+ * @slab_number_ptr: A pointer to hold the slab number.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check get_slab_number(const struct slab_depot *depot,
+					physical_block_number_t pbn,
+					slab_count_t *slab_number_ptr)
+{
+	slab_count_t slab_number;
+
+	if (pbn < depot->first_block)
+		return VDO_OUT_OF_RANGE;
+
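+	/*
+	 * Slabs are a power-of-two number of blocks laid out contiguously from first_block, so
+	 * the slab number is just the block offset shifted down. For example (numbers purely
+	 * illustrative): with 2^15-block slabs, an offset of 100000 blocks maps to slab
+	 * 100000 >> 15 = 3.
+	 */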
+	slab_number = (pbn - depot->first_block) >> depot->slab_size_shift;
+	if (slab_number >= depot->slab_count)
+		return VDO_OUT_OF_RANGE;
+
+	*slab_number_ptr = slab_number;
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_get_slab() - Get the slab object for the slab that contains a specified block.
+ * @depot: The slab depot.
+ * @pbn: The physical block number.
+ *
+ * Will put the VDO in read-only mode if the PBN is not a valid data block nor the zero block.
+ *
+ * Return: The slab containing the block, or NULL if the block number is the zero block or
+ * otherwise out of range.
+ */
+struct vdo_slab *vdo_get_slab(const struct slab_depot *depot, physical_block_number_t pbn)
+{
+	slab_count_t slab_number;
+	int result;
+
+	if (pbn == VDO_ZERO_BLOCK)
+		return NULL;
+
+	result = get_slab_number(depot, pbn, &slab_number);
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(depot->vdo, result);
+		return NULL;
+	}
+
+	return depot->slabs[slab_number];
+}
+
+/**
+ * vdo_get_increment_limit() - Determine how many new references a block can acquire.
+ * @depot: The slab depot.
+ * @pbn: The physical block number that is being queried.
+ *
+ * Context: This method must be called from the physical zone thread of the PBN.
+ *
+ * Return: The number of available references.
+ */
+u8 vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn)
+{
+	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
+	vdo_refcount_t *counter_ptr = NULL;
+	int result;
+
+	if ((slab == NULL) || (slab->status != VDO_SLAB_REBUILT))
+		return 0;
+
+	result = get_reference_counter(slab, pbn, &counter_ptr);
+	if (result != VDO_SUCCESS)
+		return 0;
+
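+	/*
+	 * A provisionally referenced block already occupies one reference slot, so it can take
+	 * one fewer new reference than a free block.
+	 */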
+	if (*counter_ptr == PROVISIONAL_REFERENCE_COUNT)
+		return (MAXIMUM_REFERENCE_COUNT - 1);
+
+	return (MAXIMUM_REFERENCE_COUNT - *counter_ptr);
+}
+
+/**
+ * vdo_is_physical_data_block() - Determine whether the given PBN refers to a data block.
+ * @depot: The depot.
+ * @pbn: The physical block number to ask about.
+ *
+ * Return: True if the PBN corresponds to a data block.
+ */
+bool vdo_is_physical_data_block(const struct slab_depot *depot, physical_block_number_t pbn)
+{
+	slab_count_t slab_number;
+	slab_block_number sbn;
+
+	return ((pbn == VDO_ZERO_BLOCK) ||
+		((get_slab_number(depot, pbn, &slab_number) == VDO_SUCCESS) &&
+		 (slab_block_number_from_pbn(depot->slabs[slab_number], pbn, &sbn) ==
+		  VDO_SUCCESS)));
+}
+
+/**
+ * vdo_get_slab_depot_allocated_blocks() - Get the total number of data blocks allocated across all
+ * the slabs in the depot.
+ * @depot: The slab depot.
+ *
+ * This is the total number of blocks with a non-zero reference count.
+ *
+ * Context: This may be called from any thread.
+ *
+ * Return: The total number of blocks with a non-zero reference count.
+ */
+block_count_t vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot)
+{
+	block_count_t total = 0;
+	zone_count_t zone;
+
+	for (zone = 0; zone < depot->zone_count; zone++)
+		/* The allocators are responsible for thread safety. */
+		total += READ_ONCE(depot->allocators[zone].allocated_blocks);
+	return total;
+}
+
+/**
+ * vdo_get_slab_depot_data_blocks() - Get the total number of data blocks in all the slabs in the
+ *                                    depot.
+ * @depot: The slab depot.
+ *
+ * Context: This may be called from any thread.
+ *
+ * Return: The total number of data blocks in all slabs.
+ */
+block_count_t vdo_get_slab_depot_data_blocks(const struct slab_depot *depot)
+{
+	return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks);
+}
+
+/**
+ * finish_combining_zones() - Clean up after saving out the combined slab summary.
+ * @completion: The vio which was used to write the summary data.
+ */
+static void finish_combining_zones(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct vdo_completion *parent = completion->parent;
+
+	free_vio(as_vio(UDS_FORGET(completion)));
+	vdo_fail_completion(parent, result);
+}
+
+static void handle_combining_error(struct vdo_completion *completion)
+{
+	vio_record_metadata_io_error(as_vio(completion));
+	finish_combining_zones(completion);
+}
+
+static void write_summary_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct vdo *vdo = vio->completion.vdo;
+
+	continue_vio_after_io(vio, finish_combining_zones, vdo->thread_config.admin_thread);
+}
+
+/**
+ * combine_summaries() - Treating the current entries buffer as the on-disk value of all zones,
+ *                       update every zone to the correct values for every slab.
+ * @depot: The depot whose summary entries should be combined.
+ */
+static void combine_summaries(struct slab_depot *depot)
+{
+	/*
+	 * Combine all the old summary data into the portion of the buffer corresponding to the
+	 * first zone.
+	 */
+	zone_count_t zone = 0;
+	struct slab_summary_entry *entries = depot->summary_entries;
+
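+	/*
+	 * Slab i's summary entry was written by zone (i % old_zone_count), so walk the slabs in
+	 * order, pulling each entry from its owning zone's region into the first zone's region.
+	 */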
+	if (depot->old_zone_count > 1) {
+		slab_count_t entry_number;
+
+		for (entry_number = 0; entry_number < MAX_VDO_SLABS; entry_number++) {
+			if (zone != 0)
+				memcpy(entries + entry_number,
+				       entries + (zone * MAX_VDO_SLABS) + entry_number,
+				       sizeof(struct slab_summary_entry));
+			zone++;
+			if (zone == depot->old_zone_count)
+				zone = 0;
+		}
+	}
+
+	/* Copy the combined data to each zone's region of the buffer. */
+	for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++)
+		memcpy(entries + (zone * MAX_VDO_SLABS),
+		       entries,
+		       MAX_VDO_SLABS * sizeof(struct slab_summary_entry));
+}
+
+/**
+ * finish_loading_summary() - Finish loading slab summary data.
+ * @completion: The vio which was used to read the summary data.
+ *
+ * Combines the slab summary data from all the previously written zones and copies the combined
+ * summary to each zone's region of the buffer. Then writes the combined summary back out to disk.
+ * This callback is registered in load_summary_endio().
+ */
+static void finish_loading_summary(struct vdo_completion *completion)
+{
+	struct slab_depot *depot = completion->vdo->depot;
+
+	/* Combine the summary from each zone so each zone is correct for all slabs. */
+	combine_summaries(depot);
+
+	/* Write the combined summary back out. */
+	submit_metadata_vio(as_vio(completion),
+			    depot->summary_origin,
+			    write_summary_endio,
+			    handle_combining_error,
+			    REQ_OP_WRITE);
+}
+
+static void load_summary_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct vdo *vdo = vio->completion.vdo;
+
+	continue_vio_after_io(vio, finish_loading_summary, vdo->thread_config.admin_thread);
+}
+
+/**
+ * load_slab_summary() - The preamble of a load operation.
+ *
+ * Implements vdo_action_preamble.
+ */
+static void load_slab_summary(void *context, struct vdo_completion *parent)
+{
+	int result;
+	struct vio *vio;
+	struct slab_depot *depot = context;
+	const struct admin_state_code *operation =
+		vdo_get_current_manager_operation(depot->action_manager);
+
+	result = create_multi_block_metadata_vio(depot->vdo,
+						 VIO_TYPE_SLAB_SUMMARY,
+						 VIO_PRIORITY_METADATA,
+						 parent,
+						 VDO_SLAB_SUMMARY_BLOCKS,
+						 (char *) depot->summary_entries,
+						 &vio);
+	if (result != VDO_SUCCESS) {
+		vdo_fail_completion(parent, result);
+		return;
+	}
+
+	if ((operation == VDO_ADMIN_STATE_FORMATTING) ||
+	    (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) {
+		finish_loading_summary(&vio->completion);
+		return;
+	}
+
+	submit_metadata_vio(vio,
+			    depot->summary_origin,
+			    load_summary_endio,
+			    handle_combining_error,
+			    REQ_OP_READ);
+}
+
+/* Implements vdo_zone_action. */
+static void load_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+
+	vdo_start_loading(&depot->allocators[zone_number].state,
+			  vdo_get_current_manager_operation(depot->action_manager),
+			  parent,
+			  initiate_load);
+}
+
+/**
+ * vdo_load_slab_depot() - Asynchronously load any slab depot state that isn't included in the
+ *                         super_block component.
+ * @depot: The depot to load.
+ * @operation: The type of load to perform.
+ * @parent: The completion to notify when the load is complete.
+ * @context: Additional context for the load operation; may be NULL.
+ *
+ * This method may be called only before entering normal operation from the load thread.
+ */
+void vdo_load_slab_depot(struct slab_depot *depot,
+			 const struct admin_state_code *operation,
+			 struct vdo_completion *parent,
+			 void *context)
+{
+	if (vdo_assert_load_operation(operation, parent))
+		vdo_schedule_operation_with_context(depot->action_manager,
+						    operation,
+						    load_slab_summary,
+						    load_allocator,
+						    NULL,
+						    context,
+						    parent);
+}
+
+/* Implements vdo_zone_action. */
+static void prepare_to_allocate(void *context,
+				zone_count_t zone_number,
+				struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+	struct block_allocator *allocator = &depot->allocators[zone_number];
+	int result;
+
+	result = vdo_prepare_slabs_for_allocation(allocator);
+	if (result != VDO_SUCCESS) {
+		vdo_fail_completion(parent, result);
+		return;
+	}
+
+	scrub_slabs(allocator, parent);
+}
+
+/**
+ * vdo_prepare_slab_depot_to_allocate() - Prepare the slab depot to come online and start
+ *                                        allocating blocks.
+ * @depot: The depot to prepare.
+ * @load_type: The load type.
+ * @parent: The completion to notify when the operation is complete.
+ *
+ * This method may be called only before entering normal operation from the load thread. It must be
+ * called before allocation may proceed.
+ */
+void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
+					enum slab_depot_load_type load_type,
+					struct vdo_completion *parent)
+{
+	depot->load_type = load_type;
+	atomic_set(&depot->zones_to_scrub, depot->zone_count);
+	vdo_schedule_action(depot->action_manager, NULL, prepare_to_allocate, NULL, parent);
+}
+
+/**
+ * vdo_update_slab_depot_size() - Update the slab depot to reflect its new size in memory.
+ * @depot: The depot to update.
+ *
+ * This size is saved to disk as part of the super block.
+ */
+void vdo_update_slab_depot_size(struct slab_depot *depot)
+{
+	depot->last_block = depot->new_last_block;
+}
+
+/**
+ * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to
+ *                                    the given size.
+ * @depot: The depot to prepare to resize.
+ * @partition: The new depot partition
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, const struct partition *partition)
+{
+	struct slab_depot_state_2_0 new_state;
+	int result;
+	slab_count_t new_slab_count;
+
+	if ((partition->count >> depot->slab_size_shift) <= depot->slab_count)
+		return VDO_INCREMENT_TOO_SMALL;
+
+	/* Generate the depot configuration for the new block count. */
+	ASSERT_LOG_ONLY(depot->first_block == partition->offset,
+			"New slab depot partition doesn't change origin");
+	result = vdo_configure_slab_depot(partition,
+					  depot->slab_config,
+					  depot->zone_count,
+					  &new_state);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	new_slab_count = vdo_compute_slab_count(depot->first_block,
+						new_state.last_block,
+						depot->slab_size_shift);
+	if (new_slab_count <= depot->slab_count)
+		return uds_log_error_strerror(VDO_INCREMENT_TOO_SMALL, "Depot can only grow");
+	if (new_slab_count == depot->new_slab_count)
+		/* Check it out, we've already got all the new slabs allocated! */
+		return VDO_SUCCESS;
+
+	vdo_abandon_new_slabs(depot);
+	result = allocate_slabs(depot, new_slab_count);
+	if (result != VDO_SUCCESS) {
+		vdo_abandon_new_slabs(depot);
+		return result;
+	}
+
+	depot->new_size = partition->count;
+	depot->old_last_block = depot->last_block;
+	depot->new_last_block = new_state.last_block;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * finish_registration() - Finish registering new slabs now that all of the allocators have
+ *                         received their new slabs.
+ *
+ * Implements vdo_action_conclusion.
+ */
+static int finish_registration(void *context)
+{
+	struct slab_depot *depot = context;
+
+	WRITE_ONCE(depot->slab_count, depot->new_slab_count);
+	UDS_FREE(depot->slabs);
+	depot->slabs = depot->new_slabs;
+	depot->new_slabs = NULL;
+	depot->new_slab_count = 0;
+	return VDO_SUCCESS;
+}
+
+/* Implements vdo_zone_action. */
+static void register_new_slabs(void *context,
+			       zone_count_t zone_number,
+			       struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+	struct block_allocator *allocator = &depot->allocators[zone_number];
+	slab_count_t i;
+
+	for (i = depot->slab_count; i < depot->new_slab_count; i++) {
+		struct vdo_slab *slab = depot->new_slabs[i];
+
+		if (slab->allocator == allocator)
+			register_slab_with_allocator(allocator, slab);
+	}
+
+	vdo_finish_completion(parent);
+}
+
+/**
+ * vdo_use_new_slabs() - Use the new slabs allocated for resize.
+ * @depot: The depot.
+ * @parent: The object to notify when complete.
+ */
+void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent)
+{
+	ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use");
+	vdo_schedule_operation(depot->action_manager,
+			       VDO_ADMIN_STATE_SUSPENDED_OPERATION,
+			       NULL,
+			       register_new_slabs,
+			       finish_registration,
+			       parent);
+}
+
+/**
+ * stop_scrubbing() - Tell the scrubber to stop scrubbing after it finishes the slab it is
+ *                    currently working on.
+ * @allocator: The allocator owning the scrubber to stop.
+ */
+static void stop_scrubbing(struct block_allocator *allocator)
+{
+	struct slab_scrubber *scrubber = &allocator->scrubber;
+
+	if (vdo_is_state_quiescent(&scrubber->admin_state))
+		vdo_finish_completion(&allocator->completion);
+	else
+		vdo_start_draining(&scrubber->admin_state,
+				   VDO_ADMIN_STATE_SUSPENDING,
+				   &allocator->completion,
+				   NULL);
+}
+
+/* Implements vdo_admin_initiator. */
+static void initiate_summary_drain(struct admin_state *state)
+{
+	check_summary_drain_complete(container_of(state, struct block_allocator, summary_state));
+}
+
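+/*
+ * Perform the next step of draining the allocator. The allocator's completion is (re)prepared
+ * with this function as its callback, so each step re-enters here when it finishes.
+ */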
+static void do_drain_step(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+
+	vdo_prepare_completion_for_requeue(&allocator->completion,
+					   do_drain_step,
+					   handle_operation_error,
+					   allocator->thread_id,
+					   NULL);
+	switch (++allocator->drain_step) {
+	case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
+		stop_scrubbing(allocator);
+		return;
+
+	case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
+		apply_to_slabs(allocator, do_drain_step);
+		return;
+
+	case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
+		vdo_start_draining(&allocator->summary_state,
+				   vdo_get_admin_state_code(&allocator->state),
+				   completion,
+				   initiate_summary_drain);
+		return;
+
+	case VDO_DRAIN_ALLOCATOR_STEP_FINISHED:
+		ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool), "vio pool not busy");
+		vdo_finish_draining_with_result(&allocator->state, completion->result);
+		return;
+
+	default:
+		vdo_finish_draining_with_result(&allocator->state, UDS_BAD_STATE);
+	}
+}
+
+/* Implements vdo_admin_initiator. */
+static void initiate_drain(struct admin_state *state)
+{
+	struct block_allocator *allocator = container_of(state, struct block_allocator, state);
+
+	allocator->drain_step = VDO_DRAIN_ALLOCATOR_START;
+	do_drain_step(&allocator->completion);
+}
+
+/*
+ * Drain all allocator I/O. Depending upon the type of drain, some or all dirty metadata may be
+ * written to disk. The type of drain will be determined from the state of the allocator's depot.
+ *
+ * Implements vdo_zone_action.
+ */
+static void drain_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+
+	vdo_start_draining(&depot->allocators[zone_number].state,
+			   vdo_get_current_manager_operation(depot->action_manager),
+			   parent,
+			   initiate_drain);
+}
+
+/**
+ * vdo_drain_slab_depot() - Drain all slab depot I/O.
+ * @depot: The depot to drain.
+ * @operation: The drain operation (flush, rebuild, suspend, or save).
+ * @parent: The completion to finish when the drain is complete.
+ *
+ * If saving, or flushing, all dirty depot metadata will be written out. If saving or suspending,
+ * the depot will be left in a suspended state.
+ */
+void vdo_drain_slab_depot(struct slab_depot *depot,
+			  const struct admin_state_code *operation,
+			  struct vdo_completion *parent)
+{
+	vdo_schedule_operation(depot->action_manager,
+			       operation,
+			       NULL,
+			       drain_allocator,
+			       NULL,
+			       parent);
+}
+
+/**
+ * resume_scrubbing() - Tell the scrubber to resume scrubbing if it has been stopped.
+ * @allocator: The allocator being resumed.
+ */
+static void resume_scrubbing(struct block_allocator *allocator)
+{
+	int result;
+	struct slab_scrubber *scrubber = &allocator->scrubber;
+
+	if (!has_slabs_to_scrub(scrubber)) {
+		vdo_finish_completion(&allocator->completion);
+		return;
+	}
+
+	result = vdo_resume_if_quiescent(&scrubber->admin_state);
+	if (result != VDO_SUCCESS) {
+		vdo_fail_completion(&allocator->completion, result);
+		return;
+	}
+
+	scrub_next_slab(scrubber);
+	vdo_finish_completion(&allocator->completion);
+}
+
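+/*
+ * Perform the next step of resuming the allocator, unwinding the drain steps in reverse order.
+ * The allocator's completion is (re)prepared with this function as its callback, so each step
+ * re-enters here when it finishes.
+ */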
+static void do_resume_step(struct vdo_completion *completion)
+{
+	struct block_allocator *allocator = vdo_as_block_allocator(completion);
+
+	vdo_prepare_completion_for_requeue(&allocator->completion,
+					   do_resume_step,
+					   handle_operation_error,
+					   allocator->thread_id,
+					   NULL);
+	switch (--allocator->drain_step) {
+	case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
+		vdo_fail_completion(completion,
+				    vdo_resume_if_quiescent(&allocator->summary_state));
+		return;
+
+	case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
+		apply_to_slabs(allocator, do_resume_step);
+		return;
+
+	case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
+		resume_scrubbing(allocator);
+		return;
+
+	case VDO_DRAIN_ALLOCATOR_START:
+		vdo_finish_resuming_with_result(&allocator->state, completion->result);
+		return;
+
+	default:
+		vdo_finish_resuming_with_result(&allocator->state, UDS_BAD_STATE);
+	}
+}
+
+/* Implements vdo_admin_initiator. */
+static void initiate_resume(struct admin_state *state)
+{
+	struct block_allocator *allocator = container_of(state, struct block_allocator, state);
+
+	allocator->drain_step = VDO_DRAIN_ALLOCATOR_STEP_FINISHED;
+	do_resume_step(&allocator->completion);
+}
+
+/* Implements vdo_zone_action. */
+static void resume_allocator(void *context,
+			     zone_count_t zone_number,
+			     struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+
+	vdo_start_resuming(&depot->allocators[zone_number].state,
+			   vdo_get_current_manager_operation(depot->action_manager),
+			   parent,
+			   initiate_resume);
+}
+
+/**
+ * vdo_resume_slab_depot() - Resume a suspended slab depot.
+ * @depot: The depot to resume.
+ * @parent: The completion to finish when the depot has resumed.
+ */
+void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent)
+{
+	if (vdo_is_read_only(depot->vdo)) {
+		vdo_continue_completion(parent, VDO_READ_ONLY);
+		return;
+	}
+
+	vdo_schedule_operation(depot->action_manager,
+			       VDO_ADMIN_STATE_RESUMING,
+			       NULL,
+			       resume_allocator,
+			       NULL,
+			       parent);
+}
+
+/**
+ * vdo_commit_oldest_slab_journal_tail_blocks() - Commit all dirty tail blocks which are locking a
+ *                                                given recovery journal block.
+ * @depot: The depot.
+ * @recovery_block_number: The sequence number of the recovery journal block whose locks should be
+ *                         released.
+ *
+ * Context: This method must be called from the journal zone thread.
+ */
+void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
+						sequence_number_t recovery_block_number)
+{
+	if (depot == NULL)
+		return;
+
+	depot->new_release_request = recovery_block_number;
+	vdo_schedule_default_action(depot->action_manager);
+}
+
+/* Implements vdo_zone_action. */
+static void scrub_all_unrecovered_slabs(void *context,
+					zone_count_t zone_number,
+					struct vdo_completion *parent)
+{
+	struct slab_depot *depot = context;
+
+	scrub_slabs(&depot->allocators[zone_number], NULL);
+	vdo_launch_completion(parent);
+}
+
+/**
+ * vdo_scrub_all_unrecovered_slabs() - Scrub all unrecovered slabs.
+ * @depot: The depot to scrub.
+ * @parent: The object to notify when scrubbing has been launched for all zones.
+ */
+void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, struct vdo_completion *parent)
+{
+	vdo_schedule_action(depot->action_manager,
+			    NULL,
+			    scrub_all_unrecovered_slabs,
+			    NULL,
+			    parent);
+}
+
+/**
+ * get_block_allocator_statistics() - Get the total of the statistics from all the block
+ *                                          allocators in the depot.
+ * @depot: The slab depot.
+ *
+ * Return: The statistics from all block allocators in the depot.
+ */
+static struct block_allocator_statistics __must_check
+get_block_allocator_statistics(const struct slab_depot *depot)
+{
+	struct block_allocator_statistics totals;
+	zone_count_t zone;
+
+	memset(&totals, 0, sizeof(totals));
+
+	for (zone = 0; zone < depot->zone_count; zone++) {
+		const struct block_allocator *allocator = &depot->allocators[zone];
+		const struct block_allocator_statistics *stats = &allocator->statistics;
+
+		totals.slab_count += allocator->slab_count;
+		totals.slabs_opened += READ_ONCE(stats->slabs_opened);
+		totals.slabs_reopened += READ_ONCE(stats->slabs_reopened);
+	}
+
+	return totals;
+}
+
+/**
+ * get_ref_counts_statistics() - Get the cumulative ref_counts statistics for the depot.
+ * @depot: The slab depot.
+ *
+ * Return: The cumulative statistics for all ref_counts in the depot.
+ */
+static struct ref_counts_statistics __must_check
+get_ref_counts_statistics(const struct slab_depot *depot)
+{
+	struct ref_counts_statistics totals;
+	zone_count_t zone;
+
+	memset(&totals, 0, sizeof(totals));
+
+	for (zone = 0; zone < depot->zone_count; zone++) {
+		totals.blocks_written +=
+			READ_ONCE(depot->allocators[zone].ref_counts_statistics.blocks_written);
+	}
+
+	return totals;
+}
+
+/**
+ * get_slab_journal_statistics() - Get the aggregated slab journal statistics for the depot.
+ * @depot: The slab depot.
+ *
+ * Return: The aggregated statistics for all slab journals in the depot.
+ */
+static struct slab_journal_statistics __must_check
+get_slab_journal_statistics(const struct slab_depot *depot)
+{
+	struct slab_journal_statistics totals;
+	zone_count_t zone;
+
+	memset(&totals, 0, sizeof(totals));
+
+	for (zone = 0; zone < depot->zone_count; zone++) {
+		const struct slab_journal_statistics *stats =
+			&depot->allocators[zone].slab_journal_statistics;
+
+		totals.disk_full_count += READ_ONCE(stats->disk_full_count);
+		totals.flush_count += READ_ONCE(stats->flush_count);
+		totals.blocked_count += READ_ONCE(stats->blocked_count);
+		totals.blocks_written += READ_ONCE(stats->blocks_written);
+		totals.tail_busy_count += READ_ONCE(stats->tail_busy_count);
+	}
+
+	return totals;
+}
+
+/**
+ * vdo_get_slab_depot_statistics() - Get all the vdo_statistics fields that are properties of the
+ *                                   slab depot.
+ * @depot: The slab depot.
+ * @stats: The vdo statistics structure to partially fill.
+ */
+void vdo_get_slab_depot_statistics(const struct slab_depot *depot, struct vdo_statistics *stats)
+{
+	slab_count_t slab_count = READ_ONCE(depot->slab_count);
+	slab_count_t unrecovered = 0;
+	zone_count_t zone;
+
+	for (zone = 0; zone < depot->zone_count; zone++) {
+		/* The allocators are responsible for thread safety. */
+		unrecovered += READ_ONCE(depot->allocators[zone].scrubber.slab_count);
+	}
+
+	stats->recovery_percentage = (slab_count - unrecovered) * 100 / slab_count;
+	stats->allocator = get_block_allocator_statistics(depot);
+	stats->ref_counts = get_ref_counts_statistics(depot);
+	stats->slab_journal = get_slab_journal_statistics(depot);
+	stats->slab_summary = (struct slab_summary_statistics) {
+		.blocks_written = atomic64_read(&depot->summary_statistics.blocks_written),
+	};
+}
+
+/**
+ * vdo_dump_slab_depot() - Dump the slab depot, in a thread-unsafe fashion.
+ * @depot: The slab depot.
+ */
+void vdo_dump_slab_depot(const struct slab_depot *depot)
+{
+	uds_log_info("vdo slab depot");
+	uds_log_info("  zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu",
+		     (unsigned int) depot->zone_count,
+		     (unsigned int) depot->old_zone_count,
+		     READ_ONCE(depot->slab_count),
+		     (unsigned long long) depot->active_release_request,
+		     (unsigned long long) depot->new_release_request);
+}
diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h
new file mode 100644
index 00000000000..2f9b3cf4a0d
--- /dev/null
+++ b/drivers/md/dm-vdo/slab-depot.h
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_SLAB_DEPOT_H
+#define VDO_SLAB_DEPOT_H
+
+#include <linux/atomic.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "physical-zone.h"
+#include "priority-table.h"
+#include "recovery-journal.h"
+#include "statistics.h"
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/*
+ * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has
+ * a single array of slabs in order to eliminate the additional math otherwise needed to compute
+ * which physical zone a PBN is in. It also has a block_allocator per zone.
+ *
+ * Load operations are required to be performed on a single thread. Normal operations are assumed
+ * to be performed in the appropriate zone. Allocations and reference count updates must be done
+ * from the thread of their physical zone. Requests to commit slab journal tail blocks from the
+ * recovery journal must be done on the journal zone thread. Save operations are required to be
+ * launched from the same thread as the original load operation.
+ */
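+
+/*
+ * Illustrative sketch (an assumption about the layout, not a quote of the implementation): with
+ * the single slab array, finding the slab and physical zone of a data PBN reduces to a shift and
+ * an index, roughly:
+ *
+ *	slab_number = (pbn - depot->first_block) >> depot->slab_size_shift;
+ *	zone_number = depot->slabs[slab_number]->allocator->zone_number;
+ */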
+
+enum {
+	/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
+	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
+};
+
+/*
+ * Represents the possible status of a block.
+ */
+enum reference_status {
+	RS_FREE, /* this block is free */
+	RS_SINGLE, /* this block is singly-referenced */
+	RS_SHARED, /* this block is shared */
+	RS_PROVISIONAL /* this block is provisionally allocated */
+};
+
+struct vdo_slab;
+
+struct journal_lock {
+	u16 count;
+	sequence_number_t recovery_start;
+};
+
+struct slab_journal {
+	/* A waiter object for getting a VIO pool entry */
+	struct waiter resource_waiter;
+	/* A waiter object for updating the slab summary */
+	struct waiter slab_summary_waiter;
+	/* A waiter object for getting a vio with which to flush */
+	struct waiter flush_waiter;
+	/* The queue of VIOs waiting to make an entry */
+	struct wait_queue entry_waiters;
+	/* The parent slab reference of this journal */
+	struct vdo_slab *slab;
+
+	/* Whether a tail block commit is pending */
+	bool waiting_to_commit;
+	/* Whether the journal is updating the slab summary */
+	bool updating_slab_summary;
+	/* Whether the journal is adding entries from the entry_waiters queue */
+	bool adding_entries;
+	/* Whether a partial write is in progress */
+	bool partial_write_in_progress;
+
+	/* The oldest block in the journal on disk */
+	sequence_number_t head;
+	/* The oldest block in the journal which may not be reaped */
+	sequence_number_t unreapable;
+	/* The end of the half-open interval of the active journal */
+	sequence_number_t tail;
+	/* The next journal block to be committed */
+	sequence_number_t next_commit;
+	/* The tail sequence number that is written in the slab summary */
+	sequence_number_t summarized;
+	/* The tail sequence number that was last summarized in slab summary */
+	sequence_number_t last_summarized;
+
+	/* The sequence number of the recovery journal lock */
+	sequence_number_t recovery_lock;
+
+	/*
+	 * The number of entries which fit in a single block. Can't use the constant because unit
+	 * tests change this number.
+	 */
+	journal_entry_count_t entries_per_block;
+	/*
+	 * The number of full entries which fit in a single block. Can't use the constant because
+	 * unit tests change this number.
+	 */
+	journal_entry_count_t full_entries_per_block;
+
+	/* The recovery journal of the VDO (slab journal holds locks on it) */
+	struct recovery_journal *recovery_journal;
+
+	/* The statistics shared by all slab journals in our physical zone */
+	struct slab_journal_statistics *events;
+	/* A list of the VIO pool entries for outstanding journal block writes */
+	struct list_head uncommitted_blocks;
+
+	/*
+	 * The current tail block header state. This will be packed into the block just before it
+	 * is written.
+	 */
+	struct slab_journal_block_header tail_header;
+	/* A pointer to a block-sized buffer holding the packed block data */
+	struct packed_slab_journal_block *block;
+
+	/* The number of blocks in the on-disk journal */
+	block_count_t size;
+	/* The number of blocks at which to start pushing reference blocks */
+	block_count_t flushing_threshold;
+	/* The number of blocks at which all reference blocks should be writing */
+	block_count_t flushing_deadline;
+	/* The number of blocks at which to wait for reference blocks to write */
+	block_count_t blocking_threshold;
+	/* The number of blocks at which to scrub the slab before coming online */
+	block_count_t scrubbing_threshold;
+
+	/* This list entry is for block_allocator to keep a queue of dirty journals */
+	struct list_head dirty_entry;
+
+	/* The lock for the oldest unreaped block of the journal */
+	struct journal_lock *reap_lock;
+	/* The locks for each on disk block */
+	struct journal_lock *locks;
+};
+
+/*
+ * Reference_block structure
+ *
+ * Blocks are used as a proxy, permitting saves of partial refcounts.
+ */
+struct reference_block {
+	/* This block waits on the ref_counts to tell it to write */
+	struct waiter waiter;
+	/* The slab to which this reference_block belongs */
+	struct vdo_slab *slab;
+	/* The number of references in this block that represent allocations */
+	block_size_t allocated_count;
+	/* The slab journal block on which this block must hold a lock */
+	sequence_number_t slab_journal_lock;
+	/* The slab journal block which should be released when this block is committed */
+	sequence_number_t slab_journal_lock_to_release;
+	/* The point up to which each sector is accurate on disk */
+	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
+	/* Whether this block has been modified since it was written to disk */
+	bool is_dirty;
+	/* Whether this block is currently writing */
+	bool is_writing;
+};
+
+/* The search_cursor represents the saved position of a free block search. */
+struct search_cursor {
+	/* The reference block containing the current search index */
+	struct reference_block *block;
+	/* The position at which to start searching for the next free counter */
+	slab_block_number index;
+	/* The position just past the last valid counter in the current block */
+	slab_block_number end_index;
+
+	/* A pointer to the first reference block in the slab */
+	struct reference_block *first_block;
+	/* A pointer to the last reference block in the slab */
+	struct reference_block *last_block;
+};
+
+enum slab_rebuild_status {
+	VDO_SLAB_REBUILT,
+	VDO_SLAB_REPLAYING,
+	VDO_SLAB_REQUIRES_SCRUBBING,
+	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
+	VDO_SLAB_REBUILDING,
+};
+
+/*
+ * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
+ * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for
+ * metadata storage for the reference counts and slab journal for the slab.
+ *
+ * A reference count is maintained for each physical block number. The vast majority of blocks have
+ * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
+ * (254) the reference count is stored in counters[pbn].
+ */
+struct vdo_slab {
+	/* A list entry to queue this slab in a block_allocator list */
+	struct list_head allocq_entry;
+
+	/* The struct block_allocator that owns this slab */
+	struct block_allocator *allocator;
+
+	/* The journal for this slab */
+	struct slab_journal journal;
+
+	/* The slab number of this slab */
+	slab_count_t slab_number;
+	/* The offset in the allocator partition of the first block in this slab */
+	physical_block_number_t start;
+	/* The offset of the first block past the end of this slab */
+	physical_block_number_t end;
+	/* The starting translated PBN of the slab journal */
+	physical_block_number_t journal_origin;
+	/* The starting translated PBN of the reference counts */
+	physical_block_number_t ref_counts_origin;
+
+	/* The administrative state of the slab */
+	struct admin_state state;
+	/* The status of the slab */
+	enum slab_rebuild_status status;
+	/* Whether the slab was ever queued for scrubbing */
+	bool was_queued_for_scrubbing;
+
+	/* The priority at which this slab has been queued for allocation */
+	u8 priority;
+
+	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
+	/* The size of the counters array */
+	u32 block_count;
+	/* The number of free blocks */
+	u32 free_blocks;
+	/* The array of reference counts */
+	vdo_refcount_t *counters; /* use UDS_ALLOCATE to align data ptr */
+
+	/* The saved block pointer and array indexes for the free block search */
+	struct search_cursor search_cursor;
+
+	/* A list of the dirty blocks waiting to be written out */
+	struct wait_queue dirty_blocks;
+	/* The number of blocks which are currently writing */
+	size_t active_count;
+
+	/* A waiter object for updating the slab summary */
+	struct waiter summary_waiter;
+
+	/* The latest slab journal for which there has been a reference count update */
+	struct journal_point slab_journal_point;
+
+	/* The number of reference count blocks */
+	u32 reference_block_count;
+	/* reference count block array */
+	struct reference_block *reference_blocks;
+};
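+
+/*
+ * Illustrative only: a minimal sketch of the counters[] lookup described above, assuming the pbn
+ * has already been validated as a data block belonging to this slab. The helper name is
+ * hypothetical and not part of the interface.
+ */
+static inline vdo_refcount_t example_slab_reference_count(const struct vdo_slab *slab,
+							   physical_block_number_t pbn)
+{
+	/* Data blocks are indexed by their offset from the start of the slab. */
+	return slab->counters[pbn - slab->start];
+}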
+
+enum block_allocator_drain_step {
+	VDO_DRAIN_ALLOCATOR_START,
+	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
+	VDO_DRAIN_ALLOCATOR_STEP_SLABS,
+	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
+	VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
+};
+
+struct slab_scrubber {
+	/* The queue of slabs to scrub first */
+	struct list_head high_priority_slabs;
+	/* The queue of slabs to scrub once there are no high_priority_slabs */
+	struct list_head slabs;
+	/* The queue of VIOs waiting for a slab to be scrubbed */
+	struct wait_queue waiters;
+
+	/*
+	 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
+	 * the physical zone thread, but is queried by other threads.
+	 */
+	slab_count_t slab_count;
+
+	/* The administrative state of the scrubber */
+	struct admin_state admin_state;
+	/* Whether to only scrub high-priority slabs */
+	bool high_priority_only;
+	/* The slab currently being scrubbed */
+	struct vdo_slab *slab;
+	/* The vio for loading slab journal blocks */
+	struct vio vio;
+};
+
+/* A sub-structure for applying actions in parallel to all of an allocator's slabs. */
+struct slab_actor {
+	/* The number of slabs performing a slab action */
+	slab_count_t slab_action_count;
+	/* The method to call when a slab action has been completed by all slabs */
+	vdo_action *callback;
+};
+
+/* A slab_iterator is a structure for iterating over a set of slabs. */
+struct slab_iterator {
+	struct vdo_slab **slabs;
+	struct vdo_slab *next;
+	slab_count_t end;
+	slab_count_t stride;
+};
+
+/*
+ * The slab_summary provides hints during load and recovery about the state of the slabs in order
+ * to avoid the need to read the slab journals in their entirety before a VDO can come online.
+ *
+ * The information in the summary for each slab includes the rough number of free blocks (which is
+ * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
+ * space will be used on restart), and the location of the tail block of the slab's journal.
+ *
+ * The slab_summary has its own partition at the end of the volume which is sized to allow for a
+ * complete copy of the summary for each of up to 16 physical zones.
+ *
+ * During resize, the slab_summary moves its backing partition and is saved once moved; the
+ * slab_summary is not permitted to overwrite the previous recovery journal space.
+ *
+ * The slab_summary does not have its own version information, but relies on the VDO volume version
+ * number.
+ */
+
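+/*
+ * A rough sketch of the free-block hint mentioned above (assumed behavior; hint_shift lives in
+ * struct slab_depot below): the 7-bit hint recorded in a summary entry is approximately
+ *
+ *	hint = free_blocks >> depot->hint_shift;
+ *
+ * with the convention (assumed here) that a hint of zero means the slab has no free blocks.
+ */
+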
+/*
+ * A slab status is a very small structure for use in determining the ordering of slabs in the
+ * scrubbing process.
+ */
+struct slab_status {
+	slab_count_t slab_number;
+	bool is_clean;
+	u8 emptiness;
+};
+
+struct slab_summary_block {
+	/* The block_allocator to which this block belongs */
+	struct block_allocator *allocator;
+	/* The index of this block in its zone's summary */
+	block_count_t index;
+	/* Whether this block has a write outstanding */
+	bool writing;
+	/* Ring of updates waiting on the outstanding write */
+	struct wait_queue current_update_waiters;
+	/* Ring of updates waiting on the next write */
+	struct wait_queue next_update_waiters;
+	/* The active slab_summary_entry array for this block */
+	struct slab_summary_entry *entries;
+	/* The vio used to write this block */
+	struct vio vio;
+	/* The packed entries, one block long, backing the vio */
+	char *outgoing_entries;
+};
+
+/*
+ * The statistics for all the slab summary zones owned by this slab summary. These fields are all
+ * mutated only by their physical zone threads, but are read by other threads when gathering
+ * statistics for the entire depot.
+ */
+struct atomic_slab_summary_statistics {
+	/* Number of blocks written */
+	atomic64_t blocks_written;
+};
+
+struct block_allocator {
+	struct vdo_completion completion;
+	/* The slab depot for this allocator */
+	struct slab_depot *depot;
+	/* The nonce of the VDO */
+	nonce_t nonce;
+	/* The physical zone number of this allocator */
+	zone_count_t zone_number;
+	/* The thread ID for this allocator's physical zone */
+	thread_id_t thread_id;
+	/* The number of slabs in this allocator */
+	slab_count_t slab_count;
+	/* The number of the last slab owned by this allocator */
+	slab_count_t last_slab;
+	/* The reduced priority level used to preserve unopened slabs */
+	unsigned int unopened_slab_priority;
+	/* The state of this allocator */
+	struct admin_state state;
+	/* The actor for applying an action to all slabs */
+	struct slab_actor slab_actor;
+
+	/* The slab from which blocks are currently being allocated */
+	struct vdo_slab *open_slab;
+	/* A priority queue containing all slabs available for allocation */
+	struct priority_table *prioritized_slabs;
+	/* The slab scrubber */
+	struct slab_scrubber scrubber;
+	/* What phase of the close operation the allocator is to perform */
+	enum block_allocator_drain_step drain_step;
+
+	/*
+	 * These statistics are all mutated only by the physical zone thread, but are read by other
+	 * threads when gathering statistics for the entire depot.
+	 */
+	/*
+	 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
+	 * historical reasons.
+	 */
+	u64 allocated_blocks;
+	/* Statistics for this block allocator */
+	struct block_allocator_statistics statistics;
+	/* Cumulative statistics for the slab journals in this zone */
+	struct slab_journal_statistics slab_journal_statistics;
+	/* Cumulative statistics for the reference counters in this zone */
+	struct ref_counts_statistics ref_counts_statistics;
+
+	/*
+	 * This is the head of a queue of slab journals which have entries in their tail blocks
+	 * which have not yet started to commit. When the recovery journal is under space pressure,
+	 * slab journals which have uncommitted entries holding a lock on the recovery journal head
+	 * are forced to commit their blocks early. This list is kept in order, with the tail
+	 * containing the slab journal holding the most recent recovery journal lock.
+	 */
+	struct list_head dirty_slab_journals;
+
+	/* The vio pool for reading and writing block allocator metadata */
+	struct vio_pool *vio_pool;
+	/* The dm_kcopyd client for erasing slab journals */
+	struct dm_kcopyd_client *eraser;
+	/* Iterator over the slabs to be erased */
+	struct slab_iterator slabs_to_erase;
+
+	/* The portion of the slab summary managed by this allocator */
+	/* The state of the slab summary */
+	struct admin_state summary_state;
+	/* The number of outstanding summary writes */
+	block_count_t summary_write_count;
+	/* The array (owned by the blocks) of all entries */
+	struct slab_summary_entry *summary_entries;
+	/* The array of slab_summary_blocks */
+	struct slab_summary_block *summary_blocks;
+};
+
+enum slab_depot_load_type {
+	VDO_SLAB_DEPOT_NORMAL_LOAD,
+	VDO_SLAB_DEPOT_RECOVERY_LOAD,
+	VDO_SLAB_DEPOT_REBUILD_LOAD
+};
+
+struct slab_depot {
+	zone_count_t zone_count;
+	zone_count_t old_zone_count;
+	struct vdo *vdo;
+	struct slab_config slab_config;
+	struct action_manager *action_manager;
+
+	physical_block_number_t first_block;
+	physical_block_number_t last_block;
+	physical_block_number_t origin;
+
+	/* slab_size == (1 << slab_size_shift) */
+	unsigned int slab_size_shift;
+
+	/* Determines how slabs should be queued during load */
+	enum slab_depot_load_type load_type;
+
+	/* The state for notifying slab journals to release recovery journal */
+	sequence_number_t active_release_request;
+	sequence_number_t new_release_request;
+
+	/* State variables for scrubbing complete handling */
+	atomic_t zones_to_scrub;
+
+	/* Array of pointers to individually allocated slabs */
+	struct vdo_slab **slabs;
+	/* The number of slabs currently allocated and stored in 'slabs' */
+	slab_count_t slab_count;
+
+	/* Array of pointers to a larger set of slabs (used during resize) */
+	struct vdo_slab **new_slabs;
+	/* The number of slabs currently allocated and stored in 'new_slabs' */
+	slab_count_t new_slab_count;
+	/* The size that 'new_slabs' was allocated for */
+	block_count_t new_size;
+
+	/* The last block before resize, for rollback */
+	physical_block_number_t old_last_block;
+	/* The last block after resize, for resize */
+	physical_block_number_t new_last_block;
+
+	/* The statistics for the slab summary */
+	struct atomic_slab_summary_statistics summary_statistics;
+	/* The start of the slab summary partition */
+	physical_block_number_t summary_origin;
+	/* The number of bits to shift to get a 7-bit fullness hint */
+	unsigned int hint_shift;
+	/* The slab summary entries for all of the zones the partition can hold */
+	struct slab_summary_entry *summary_entries;
+
+	/* The block allocators for this depot */
+	struct block_allocator allocators[];
+};
+
+struct reference_updater;
+
+bool __must_check
+vdo_attempt_replay_into_slab(struct vdo_slab *slab,
+			     physical_block_number_t pbn,
+			     enum journal_operation operation,
+			     bool increment,
+			     struct journal_point *recovery_point,
+			     struct vdo_completion *parent);
+
+int __must_check
+vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
+				       physical_block_number_t pbn,
+				       enum journal_operation operation);
+
+static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
+	return container_of(completion, struct block_allocator, completion);
+}
+
+int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
+						   physical_block_number_t pbn,
+						   struct pbn_lock *lock);
+
+int __must_check
+vdo_allocate_block(struct block_allocator *allocator, physical_block_number_t *block_number_ptr);
+
+int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator, struct waiter *waiter);
+
+void vdo_modify_reference_count(struct vdo_completion *completion,
+				struct reference_updater *updater);
+
+int __must_check vdo_release_block_reference(struct block_allocator *allocator,
+					     physical_block_number_t pbn);
+
+void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);
+
+void vdo_dump_block_allocator(const struct block_allocator *allocator);
+
+int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
+				       struct vdo *vdo,
+				       struct partition *summary_partition,
+				       struct slab_depot **depot_ptr);
+
+void vdo_free_slab_depot(struct slab_depot *depot);
+
+struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);
+
+int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);
+
+struct vdo_slab * __must_check
+vdo_get_slab(const struct slab_depot *depot, physical_block_number_t pbn);
+
+u8 __must_check vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn);
+
+bool __must_check
+vdo_is_physical_data_block(const struct slab_depot *depot, physical_block_number_t pbn);
+
+block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);
+
+block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);
+
+void vdo_get_slab_depot_statistics(const struct slab_depot *depot, struct vdo_statistics *stats);
+
+void vdo_load_slab_depot(struct slab_depot *depot,
+			 const struct admin_state_code *operation,
+			 struct vdo_completion *parent,
+			 void *context);
+
+void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
+					enum slab_depot_load_type load_type,
+					struct vdo_completion *parent);
+
+void vdo_update_slab_depot_size(struct slab_depot *depot);
+
+int __must_check
+vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, const struct partition *partition);
+
+void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);
+
+void vdo_abandon_new_slabs(struct slab_depot *depot);
+
+void vdo_drain_slab_depot(struct slab_depot *depot,
+			  const struct admin_state_code *operation,
+			  struct vdo_completion *parent);
+
+void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);
+
+void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
+						sequence_number_t recovery_block_number);
+
+void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, struct vdo_completion *parent);
+
+void vdo_dump_slab_depot(const struct slab_depot *depot);
+
+#endif /* VDO_SLAB_DEPOT_H */
diff --git a/drivers/md/dm-vdo/statistics.h b/drivers/md/dm-vdo/statistics.h
new file mode 100644
index 00000000000..28f69ea0c8b
--- /dev/null
+++ b/drivers/md/dm-vdo/statistics.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef STATISTICS_H
+#define STATISTICS_H
+
+#include "types.h"
+
+enum {
+	STATISTICS_VERSION = 36,
+};
+
+struct block_allocator_statistics {
+	/** The total number of slabs from which blocks may be allocated */
+	u64 slab_count;
+	/** The total number of slabs from which blocks have ever been allocated */
+	u64 slabs_opened;
+	/** The number of times since loading that a slab has been re-opened */
+	u64 slabs_reopened;
+};
+
+/**
+ * Counters for tracking the number of items written (blocks, requests,
+ * etc.) at each step in the write pipeline. Three counters allow the
+ * number of buffered, in-memory items and the number of in-flight,
+ * unacknowledged writes to be derived, while still tracking totals for
+ * reporting purposes.
+ */
+struct commit_statistics {
+	/** The total number of items on which processing has started */
+	u64 started;
+	/** The total number of items for which a write operation has been issued */
+	u64 written;
+	/** The total number of items for which a write operation has completed */
+	u64 committed;
+};
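+
+/*
+ * For example, the derived quantities described above are simple differences of these
+ * totals (the names here are illustrative, not part of the interface):
+ *
+ *	buffered  = started - written;    items in memory, not yet issued
+ *	in_flight = written - committed;  writes issued but not yet acknowledged
+ */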
+
+/** Counters for events in the recovery journal */
+struct recovery_journal_statistics {
+	/** Number of times the on-disk journal was full */
+	u64 disk_full;
+	/** Number of times the recovery journal requested slab journal commits. */
+	u64 slab_journal_commits_requested;
+	/** Write/Commit totals for individual journal entries */
+	struct commit_statistics entries;
+	/** Write/Commit totals for journal blocks */
+	struct commit_statistics blocks;
+};
+
+/** The statistics for the compressed block packer. */
+struct packer_statistics {
+	/** Number of compressed data items written since startup */
+	u64 compressed_fragments_written;
+	/** Number of blocks containing compressed items written since startup */
+	u64 compressed_blocks_written;
+	/** Number of VIOs that are pending in the packer */
+	u64 compressed_fragments_in_packer;
+};
+
+/** The statistics for the slab journals. */
+struct slab_journal_statistics {
+	/** Number of times the on-disk journal was full */
+	u64 disk_full_count;
+	/** Number of times an entry was added over the flush threshold */
+	u64 flush_count;
+	/** Number of times an entry was added over the block threshold */
+	u64 blocked_count;
+	/** Number of times a tail block was written */
+	u64 blocks_written;
+	/** Number of times we had to wait for the tail to write */
+	u64 tail_busy_count;
+};
+
+/** The statistics for the slab summary. */
+struct slab_summary_statistics {
+	/** Number of blocks written */
+	u64 blocks_written;
+};
+
+/** The statistics for the reference counts. */
+struct ref_counts_statistics {
+	/** Number of reference blocks written */
+	u64 blocks_written;
+};
+
+/** The statistics for the block map. */
+struct block_map_statistics {
+	/** number of dirty (resident) pages */
+	u32 dirty_pages;
+	/** number of clean (resident) pages */
+	u32 clean_pages;
+	/** number of free pages */
+	u32 free_pages;
+	/** number of pages in failed state */
+	u32 failed_pages;
+	/** number of pages incoming */
+	u32 incoming_pages;
+	/** number of pages outgoing */
+	u32 outgoing_pages;
+	/** how many times a free page was not available */
+	u32 cache_pressure;
+	/** number of get_vdo_page() calls for read */
+	u64 read_count;
+	/** number of get_vdo_page() calls for write */
+	u64 write_count;
+	/** number of times pages failed to read */
+	u64 failed_reads;
+	/** number of times pages failed to write */
+	u64 failed_writes;
+	/** number of gets that are reclaimed */
+	u64 reclaimed;
+	/** number of gets for outgoing pages */
+	u64 read_outgoing;
+	/** number of gets that were already there */
+	u64 found_in_cache;
+	/** number of gets requiring discard */
+	u64 discard_required;
+	/** number of gets enqueued for their page */
+	u64 wait_for_page;
+	/** number of gets that have to fetch */
+	u64 fetch_required;
+	/** number of page fetches */
+	u64 pages_loaded;
+	/** number of page saves */
+	u64 pages_saved;
+	/** the number of flushes issued */
+	u64 flush_count;
+};
+
+/** The dedupe statistics from hash locks */
+struct hash_lock_statistics {
+	/** Number of times the UDS advice proved correct */
+	u64 dedupe_advice_valid;
+	/** Number of times the UDS advice proved incorrect */
+	u64 dedupe_advice_stale;
+	/** Number of writes with the same data as another in-flight write */
+	u64 concurrent_data_matches;
+	/** Number of writes whose hash collided with an in-flight write */
+	u64 concurrent_hash_collisions;
+	/** Current number of dedupe queries that are in flight */
+	u32 curr_dedupe_queries;
+};
+
+/** Counts of error conditions in VDO. */
+struct error_statistics {
+	/** number of times VDO got an invalid dedupe advice PBN from UDS */
+	u64 invalid_advice_pbn_count;
+	/** number of times a VIO completed with a VDO_NO_SPACE error */
+	u64 no_space_error_count;
+	/** number of times a VIO completed with a VDO_READ_ONLY error */
+	u64 read_only_error_count;
+};
+
+struct bio_stats {
+	/** Number of REQ_OP_READ bios */
+	u64 read;
+	/** Number of REQ_OP_WRITE bios with data */
+	u64 write;
+	/** Number of bios tagged with REQ_PREFLUSH and containing no data */
+	u64 empty_flush;
+	/** Number of REQ_OP_DISCARD bios */
+	u64 discard;
+	/** Number of bios tagged with REQ_PREFLUSH */
+	u64 flush;
+	/** Number of bios tagged with REQ_FUA */
+	u64 fua;
+};
+
+struct memory_usage {
+	/** Tracked bytes currently allocated. */
+	u64 bytes_used;
+	/** Maximum tracked bytes allocated. */
+	u64 peak_bytes_used;
+};
+
+/** UDS index statistics */
+struct index_statistics {
+	/** Number of records stored in the index */
+	u64 entries_indexed;
+	/** Number of post calls that found an existing entry */
+	u64 posts_found;
+	/** Number of post calls that added a new entry */
+	u64 posts_not_found;
+	/** Number of query calls that found an existing entry */
+	u64 queries_found;
+	/** Number of query calls that added a new entry */
+	u64 queries_not_found;
+	/** Number of update calls that found an existing entry */
+	u64 updates_found;
+	/** Number of update calls that added a new entry */
+	u64 updates_not_found;
+	/** Number of entries discarded */
+	u64 entries_discarded;
+};
+
+/** The statistics of the vdo service. */
+struct vdo_statistics {
+	u32 version;
+	u32 release_version;
+	/** Number of blocks used for data */
+	u64 data_blocks_used;
+	/** Number of blocks used for VDO metadata */
+	u64 overhead_blocks_used;
+	/** Number of logical blocks that are currently mapped to physical blocks */
+	u64 logical_blocks_used;
+	/** number of physical blocks */
+	block_count_t physical_blocks;
+	/** number of logical blocks */
+	block_count_t logical_blocks;
+	/** Size of the block map page cache, in bytes */
+	u64 block_map_cache_size;
+	/** The physical block size */
+	u64 block_size;
+	/** Number of times the VDO has successfully recovered */
+	u64 complete_recoveries;
+	/** Number of times the VDO has recovered from read-only mode */
+	u64 read_only_recoveries;
+	/** String describing the operating mode of the VDO */
+	char mode[15];
+	/** Whether the VDO is in recovery mode */
+	bool in_recovery_mode;
+	/** What percentage of recovery mode work has been completed */
+	u8 recovery_percentage;
+	/** The statistics for the compressed block packer */
+	struct packer_statistics packer;
+	/** Counters for events in the block allocator */
+	struct block_allocator_statistics allocator;
+	/** Counters for events in the recovery journal */
+	struct recovery_journal_statistics journal;
+	/** The statistics for the slab journals */
+	struct slab_journal_statistics slab_journal;
+	/** The statistics for the slab summary */
+	struct slab_summary_statistics slab_summary;
+	/** The statistics for the reference counts */
+	struct ref_counts_statistics ref_counts;
+	/** The statistics for the block map */
+	struct block_map_statistics block_map;
+	/** The dedupe statistics from hash locks */
+	struct hash_lock_statistics hash_lock;
+	/** Counts of error conditions */
+	struct error_statistics errors;
+	/** The VDO instance */
+	u32 instance;
+	/** Current number of active VIOs */
+	u32 current_vios_in_progress;
+	/** Maximum number of active VIOs */
+	u32 max_vios;
+	/** Number of times the UDS index was too slow in responding */
+	u64 dedupe_advice_timeouts;
+	/** Number of flush requests submitted to the storage device */
+	u64 flush_out;
+	/** Logical block size */
+	u64 logical_block_size;
+	/** Bios submitted into VDO from above */
+	struct bio_stats bios_in;
+	struct bio_stats bios_in_partial;
+	/** Bios submitted onward for user data */
+	struct bio_stats bios_out;
+	/** Bios submitted onward for metadata */
+	struct bio_stats bios_meta;
+	struct bio_stats bios_journal;
+	struct bio_stats bios_page_cache;
+	struct bio_stats bios_out_completed;
+	struct bio_stats bios_meta_completed;
+	struct bio_stats bios_journal_completed;
+	struct bio_stats bios_page_cache_completed;
+	struct bio_stats bios_acknowledged;
+	struct bio_stats bios_acknowledged_partial;
+	/** Current number of bios in progress */
+	struct bio_stats bios_in_progress;
+	/** Memory usage stats. */
+	struct memory_usage memory_usage;
+	/** The statistics for the UDS index */
+	struct index_statistics index;
+};
+
+#endif /* not STATISTICS_H */
diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c
new file mode 100644
index 00000000000..bbad5a918e1
--- /dev/null
+++ b/drivers/md/dm-vdo/status-codes.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "status-codes.h"
+
+
+#include "errors.h"
+#include "logger.h"
+#include "permassert.h"
+#include "uds-threads.h"
+
+const struct error_info vdo_status_list[] = {
+	{ "VDO_NOT_IMPLEMENTED", "Not implemented" },
+	{ "VDO_OUT_OF_RANGE", "Out of range" },
+	{ "VDO_REF_COUNT_INVALID", "Reference count would become invalid" },
+	{ "VDO_NO_SPACE", "Out of space" },
+	{ "VDO_UNEXPECTED_EOF", "Unexpected EOF on block read" },
+	{ "VDO_BAD_CONFIGURATION", "Bad configuration option" },
+	{ "VDO_SOCKET_ERROR", "Socket error" },
+	{ "VDO_BAD_ALIGNMENT", "Mis-aligned block reference" },
+	{ "VDO_COMPONENT_BUSY", "Prior operation still in progress" },
+	{ "VDO_BAD_PAGE", "Corrupt or incorrect page" },
+	{ "VDO_UNSUPPORTED_VERSION", "Unsupported component version" },
+	{ "VDO_INCORRECT_COMPONENT", "Component id mismatch in decoder" },
+	{ "VDO_PARAMETER_MISMATCH", "Parameters have conflicting values" },
+	{ "VDO_BLOCK_SIZE_TOO_SMALL", "The block size is too small" },
+	{ "VDO_UNKNOWN_PARTITION", "No partition exists with a given id" },
+	{ "VDO_PARTITION_EXISTS", "A partition already exists with a given id" },
+	{ "VDO_NOT_READ_ONLY", "The device is not in read-only mode" },
+	{ "VDO_INCREMENT_TOO_SMALL", "Physical block growth of too few blocks" },
+	{ "VDO_CHECKSUM_MISMATCH", "Incorrect checksum" },
+	{ "VDO_RECOVERY_JOURNAL_FULL", "The recovery journal is full" },
+	{ "VDO_LOCK_ERROR", "A lock is held incorrectly" },
+	{ "VDO_READ_ONLY", "The device is in read-only mode" },
+	{ "VDO_SHUTTING_DOWN", "The device is shutting down" },
+	{ "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" },
+	{ "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" },
+	{ "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" },
+	{ "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" },
+	{ "VDO_UNKNOWN_COMMAND", "The extended command is not known" },
+	{ "VDO_COMMAND_ERROR", "Bad extended command parameters" },
+	{ "VDO_CANNOT_DETERMINE_SIZE", "Cannot determine config sizes to fit" },
+	{ "VDO_BAD_MAPPING", "Invalid page mapping" },
+	{ "VDO_READ_CACHE_BUSY", "Read cache has no free slots" },
+	{ "VDO_BIO_CREATION_FAILED", "Bio creation failed" },
+	{ "VDO_BAD_MAGIC", "Bad magic number" },
+	{ "VDO_BAD_NONCE", "Bad nonce" },
+	{ "VDO_JOURNAL_OVERFLOW", "Journal sequence number overflow" },
+	{ "VDO_INVALID_ADMIN_STATE", "Invalid operation for current state" },
+	{ "VDO_CANT_ADD_SYSFS_NODE", "Failed to add sysfs node" },
+};
+
+static atomic_t vdo_status_codes_registered = ATOMIC_INIT(0);
+static int status_code_registration_result;
+
+static void do_status_code_registration(void)
+{
+	int result;
+
+	STATIC_ASSERT((VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE) ==
+		      ARRAY_SIZE(vdo_status_list));
+
+	result = uds_register_error_block("VDO Status",
+					  VDO_STATUS_CODE_BASE,
+					  VDO_STATUS_CODE_BLOCK_END,
+					  vdo_status_list,
+					  sizeof(vdo_status_list));
+	/*
+	 * The following test handles cases where libvdo is statically linked against both the test
+	 * modules and the test driver (because multiple instances of this module call their own
+	 * copy of this function once each, resulting in multiple calls to register_error_block
+	 * which is shared in libuds).
+	 */
+	if (result == UDS_DUPLICATE_NAME)
+		result = UDS_SUCCESS;
+
+	status_code_registration_result = (result == UDS_SUCCESS) ? VDO_SUCCESS : result;
+}
+
+/**
+ * vdo_register_status_codes() - Register the VDO status codes if needed.
+ * Return: A success or error code.
+ */
+int vdo_register_status_codes(void)
+{
+	uds_perform_once(&vdo_status_codes_registered, do_status_code_registration);
+	return status_code_registration_result;
+}
+
+/**
+ * vdo_map_to_system_error() - Given an error code, return a value we can return to the OS.
+ * @error: The error code to convert.
+ *
+ * The input error code may be a system-generated value (such as -EIO), an errno macro used in our
+ * code (such as EIO), or a UDS or VDO status code; the result must be something the rest of the OS
+ * can consume (negative errno values such as -EIO, in the case of the kernel).
+ *
+ * Return: A system error code value.
+ */
+int vdo_map_to_system_error(int error)
+{
+	char error_name[UDS_MAX_ERROR_NAME_SIZE];
+	char error_message[UDS_MAX_ERROR_MESSAGE_SIZE];
+
+	/* 0 is success, negative a system error code */
+	if (likely(error <= 0))
+		return error;
+	if (error < 1024)
+		return -error;
+
+	/* VDO or UDS error */
+	switch (error) {
+	case VDO_NO_SPACE:
+		return -ENOSPC;
+	case VDO_READ_ONLY:
+		return -EIO;
+	default:
+		uds_log_info("%s: mapping internal status code %d (%s: %s) to EIO",
+			     __func__,
+			     error,
+			     uds_string_error_name(error, error_name, sizeof(error_name)),
+			     uds_string_error(error, error_message, sizeof(error_message)));
+		return -EIO;
+	}
+}
diff --git a/drivers/md/dm-vdo/status-codes.h b/drivers/md/dm-vdo/status-codes.h
new file mode 100644
index 00000000000..34ad2445419
--- /dev/null
+++ b/drivers/md/dm-vdo/status-codes.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_STATUS_CODES_H
+#define VDO_STATUS_CODES_H
+
+#include "errors.h"
+
+enum {
+	UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE,
+	VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END,
+	VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE,
+	PRP_BLOCK_START = VDO_BLOCK_END,
+	PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE,
+};
+
+/* VDO-specific status codes. */
+enum vdo_status_codes {
+	/* successful result */
+	VDO_SUCCESS,
+	/* base of all VDO errors */
+	VDO_STATUS_CODE_BASE = VDO_BLOCK_START,
+	/* we haven't written this yet */
+	VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE,
+	/* input out of range */
+	VDO_OUT_OF_RANGE,
+	/* an invalid reference count would result */
+	VDO_REF_COUNT_INVALID,
+	/* a free block could not be allocated */
+	VDO_NO_SPACE,
+	/* unexpected EOF on block read */
+	VDO_UNEXPECTED_EOF,
+	/* improper or missing configuration option */
+	VDO_BAD_CONFIGURATION,
+	/* socket opening or binding problem */
+	VDO_SOCKET_ERROR,
+	/* read or write on non-aligned offset */
+	VDO_BAD_ALIGNMENT,
+	/* prior operation still in progress */
+	VDO_COMPONENT_BUSY,
+	/* page contents incorrect or corrupt data */
+	VDO_BAD_PAGE,
+	/* unsupported version of some component */
+	VDO_UNSUPPORTED_VERSION,
+	/* component id mismatch in decoder */
+	VDO_INCORRECT_COMPONENT,
+	/* parameters have conflicting values */
+	VDO_PARAMETER_MISMATCH,
+	/* the block size is too small */
+	VDO_BLOCK_SIZE_TOO_SMALL,
+	/* no partition exists with a given id */
+	VDO_UNKNOWN_PARTITION,
+	/* a partition already exists with a given id */
+	VDO_PARTITION_EXISTS,
+	/* the VDO is not in read-only mode */
+	VDO_NOT_READ_ONLY,
+	/* physical block growth of too few blocks */
+	VDO_INCREMENT_TOO_SMALL,
+	/* incorrect checksum */
+	VDO_CHECKSUM_MISMATCH,
+	/* the recovery journal is full */
+	VDO_RECOVERY_JOURNAL_FULL,
+	/* a lock is held incorrectly */
+	VDO_LOCK_ERROR,
+	/* the VDO is in read-only mode */
+	VDO_READ_ONLY,
+	/* the VDO is shutting down */
+	VDO_SHUTTING_DOWN,
+	/* the recovery journal has corrupt entries */
+	VDO_CORRUPT_JOURNAL,
+	/* exceeds maximum number of slabs supported */
+	VDO_TOO_MANY_SLABS,
+	/* a compressed block fragment is invalid */
+	VDO_INVALID_FRAGMENT,
+	/* action is unsupported while rebuilding */
+	VDO_RETRY_AFTER_REBUILD,
+	/* the extended command is not known */
+	VDO_UNKNOWN_COMMAND,
+	/* bad extended command parameters */
+	VDO_COMMAND_ERROR,
+	/* cannot determine sizes to fit */
+	VDO_CANNOT_DETERMINE_SIZE,
+	/* a block map entry is invalid */
+	VDO_BAD_MAPPING,
+	/* read cache has no free slots */
+	VDO_READ_CACHE_BUSY,
+	/* bio_add_page failed */
+	VDO_BIO_CREATION_FAILED,
+	/* bad magic number */
+	VDO_BAD_MAGIC,
+	/* bad nonce */
+	VDO_BAD_NONCE,
+	/* sequence number overflow */
+	VDO_JOURNAL_OVERFLOW,
+	/* the VDO is not in a state to perform an admin operation */
+	VDO_INVALID_ADMIN_STATE,
+	/* failure adding a sysfs node */
+	VDO_CANT_ADD_SYSFS_NODE,
+	/* one more than last error code */
+	VDO_STATUS_CODE_LAST,
+	VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END
+};
+
+extern const struct error_info vdo_status_list[];
+
+int vdo_register_status_codes(void);
+
+int vdo_map_to_system_error(int error);
+
+#endif /* VDO_STATUS_CODES_H */
diff --git a/drivers/md/dm-vdo/sysfs.c b/drivers/md/dm-vdo/sysfs.c
new file mode 100644
index 00000000000..a091933a0a5
--- /dev/null
+++ b/drivers/md/dm-vdo/sysfs.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include <linux/module.h>
+
+#include "logger.h"
+
+#include "constants.h"
+#include "dedupe.h"
+#include "vdo.h"
+
+static int vdo_log_level_show(char *buf, const struct kernel_param *kp)
+{
+	return sprintf(buf, "%s\n", uds_log_priority_to_string(uds_get_log_level()));
+}
+
+static int vdo_log_level_store(const char *buf, const struct kernel_param *kp)
+{
+	static char internal_buf[11];
+
+	int n = strlen(buf);
+
+	/* Reject empty or over-long input before indexing internal_buf[n - 1] below. */
+	if ((n == 0) || (n > 10))
+		return -EINVAL;
+
+	memset(internal_buf, '\000', sizeof(internal_buf));
+	memcpy(internal_buf, buf, n);
+	if (internal_buf[n - 1] == '\n')
+		internal_buf[n - 1] = '\000';
+	uds_set_log_level(uds_log_string_to_priority(internal_buf));
+	return 0;
+}
+
+
+static int vdo_dedupe_timeout_interval_store(const char *buf, const struct kernel_param *kp)
+{
+	int result = param_set_uint(buf, kp);
+
+	if (result != 0)
+		return result;
+	vdo_set_dedupe_index_timeout_interval(*(uint *)kp->arg);
+	return 0;
+}
+
+static int vdo_min_dedupe_timer_interval_store(const char *buf, const struct kernel_param *kp)
+{
+	int result = param_set_uint(buf, kp);
+
+	if (result != 0)
+		return result;
+	vdo_set_dedupe_index_min_timer_interval(*(uint *)kp->arg);
+	return 0;
+}
+
+static const struct kernel_param_ops log_level_ops = {
+	.set = vdo_log_level_store,
+	.get = vdo_log_level_show,
+};
+
+
+static const struct kernel_param_ops dedupe_timeout_ops = {
+	.set = vdo_dedupe_timeout_interval_store,
+	.get = param_get_uint,
+};
+
+static const struct kernel_param_ops dedupe_timer_ops = {
+	.set = vdo_min_dedupe_timer_interval_store,
+	.get = param_get_uint,
+};
+
+module_param_cb(log_level, &log_level_ops, NULL, 0644);
+
+
+module_param_cb(deduplication_timeout_interval,
+		&dedupe_timeout_ops,
+		&vdo_dedupe_index_timeout_interval,
+		0644);
+
+module_param_cb(min_deduplication_timer_interval,
+		&dedupe_timer_ops,
+		&vdo_dedupe_index_min_timer_interval,
+		0644);
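+
+/*
+ * Usage sketch (assuming the module loads under the name "kvdo"; the real name comes from the
+ * module build, not from this file): the parameters registered above appear under
+ * /sys/module/kvdo/parameters/, so, for example,
+ *
+ *	echo info > /sys/module/kvdo/parameters/log_level
+ *
+ * would change the logging level at runtime.
+ */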
diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h
new file mode 100644
index 00000000000..ad719427290
--- /dev/null
+++ b/drivers/md/dm-vdo/types.h
@@ -0,0 +1,403 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_TYPES_H
+#define VDO_TYPES_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
+
+#include "funnel-queue.h"
+
+/* A size type in blocks. */
+typedef u64 block_count_t;
+
+/* The size of a block. */
+typedef u16 block_size_t;
+
+/* A counter for data_vios */
+typedef u16 data_vio_count_t;
+
+/* A height within a tree. */
+typedef u8 height_t;
+
+/* The logical block number as used by the consumer. */
+typedef u64 logical_block_number_t;
+
+/* The type of the nonce used to identify instances of VDO. */
+typedef u64 nonce_t;
+
+/* A size in pages. */
+typedef u32 page_count_t;
+
+/* A page number. */
+typedef u32 page_number_t;
+
+/*
+ * The physical (well, less logical) block number at which the block is found on the underlying
+ * device.
+ */
+typedef u64 physical_block_number_t;
+
+/*
+ * A release version number. These numbers are used to make the numbering space for component
+ * versions independent across release branches.
+ *
+ * Really an enum, but we have to specify the size for encoding; see release_versions.h for the
+ * enumeration values.
+ */
+typedef u32 release_version_number_t;
+
+/* A count of tree roots. */
+typedef u8 root_count_t;
+
+/* A number of sectors. */
+typedef u8 sector_count_t;
+
+/* A sequence number. */
+typedef u64 sequence_number_t;
+
+/* The offset of a block within a slab. */
+typedef u32 slab_block_number;
+
+/* A size type in slabs. */
+typedef u16 slab_count_t;
+
+/* A slot in a bin or block map page. */
+typedef u16 slot_number_t;
+
+/* typedef thread_count_t - A thread counter. */
+typedef u8 thread_count_t;
+
+/* typedef thread_id_t - A thread ID, vdo threads are numbered sequentially from 0. */
+typedef u8 thread_id_t;
+
+/* A zone counter */
+typedef u8 zone_count_t;
+
+/* The following enums are persisted on storage, so the values must be preserved. */
+
+/* The current operating mode of the VDO. */
+enum vdo_state {
+	VDO_DIRTY = 0,
+	VDO_NEW = 1,
+	VDO_CLEAN = 2,
+	VDO_READ_ONLY_MODE = 3,
+	VDO_FORCE_REBUILD = 4,
+	VDO_RECOVERING = 5,
+	VDO_REPLAYING = 6, /* VDO_REPLAYING is never set anymore, but retained for upgrade */
+	VDO_REBUILD_FOR_UPGRADE = 7,
+
+	/* Keep VDO_STATE_COUNT at the bottom. */
+	VDO_STATE_COUNT
+};
+
+/**
+ * vdo_state_requires_read_only_rebuild() - Check whether a vdo_state indicates
+ * that a read-only rebuild is required.
+ * @state: The vdo_state to check.
+ *
+ * Return: true if the state indicates a rebuild is required
+ */
+static inline bool __must_check vdo_state_requires_read_only_rebuild(enum vdo_state state)
+{
+	return ((state == VDO_FORCE_REBUILD) || (state == VDO_REBUILD_FOR_UPGRADE));
+}
+
+/**
+ * vdo_state_requires_recovery() - Check whether a vdo state indicates that recovery is needed.
+ * @state: The state to check.
+ *
+ * Return: true if the state indicates a recovery is required
+ */
+static inline bool __must_check vdo_state_requires_recovery(enum vdo_state state)
+{
+	return ((state == VDO_DIRTY) || (state == VDO_REPLAYING) || (state == VDO_RECOVERING));
+}
+
+/*
+ * The current operation on a physical block (from the point of view of the recovery journal, slab
+ * journals, and reference counts).
+ */
+enum journal_operation {
+	VDO_JOURNAL_DATA_REMAPPING = 0,
+	VDO_JOURNAL_BLOCK_MAP_REMAPPING = 1,
+} __packed;
+
+/* Partition IDs encoded in the volume layout in the super block. */
+enum partition_id {
+	VDO_BLOCK_MAP_PARTITION = 0,
+	VDO_SLAB_DEPOT_PARTITION = 1,
+	VDO_RECOVERY_JOURNAL_PARTITION = 2,
+	VDO_SLAB_SUMMARY_PARTITION = 3,
+} __packed;
+
+/* Metadata types for the vdo. */
+enum vdo_metadata_type {
+	VDO_METADATA_RECOVERY_JOURNAL = 1,
+	VDO_METADATA_SLAB_JOURNAL = 2,
+	VDO_METADATA_RECOVERY_JOURNAL_2 = 3,
+} __packed;
+
+/* A position in the block map where a block map entry is stored. */
+struct block_map_slot {
+	physical_block_number_t pbn;
+	slot_number_t slot;
+};
+
+/*
+ * Four bits of each five-byte block map entry contain a mapping state value used to distinguish
+ * unmapped or trimmed logical blocks (which are treated as mapped to the zero block) from entries
+ * that have been mapped to a physical block, including the zero block.
+ *
+ * FIXME: these should maybe be defines.
+ */
+enum block_mapping_state {
+	VDO_MAPPING_STATE_UNMAPPED = 0, /* Must be zero to be the default value */
+	VDO_MAPPING_STATE_UNCOMPRESSED = 1, /* A normal (uncompressed) block */
+	VDO_MAPPING_STATE_COMPRESSED_BASE = 2, /* Compressed in slot 0 */
+	VDO_MAPPING_STATE_COMPRESSED_MAX = 15, /* Compressed in slot 13 */
+};
+
+enum {
+	VDO_MAX_COMPRESSION_SLOTS =
+		(VDO_MAPPING_STATE_COMPRESSED_MAX - VDO_MAPPING_STATE_COMPRESSED_BASE + 1),
+};
+
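+/*
+ * Illustrative relationship implied by the constants above (not a quoted helper): a compressed
+ * fragment stored in slot N of a compressed block is recorded with mapping state
+ *
+ *	VDO_MAPPING_STATE_COMPRESSED_BASE + N
+ *
+ * which is why slots 0 through 13 span states 2 through 15.
+ */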
+
+struct data_location {
+	physical_block_number_t pbn;
+	enum block_mapping_state state;
+};
+
+/* The configuration of a single slab derived from the configured block size and slab size. */
+struct slab_config {
+	/* total number of blocks in the slab */
+	block_count_t slab_blocks;
+	/* number of blocks available for data */
+	block_count_t data_blocks;
+	/* number of blocks for reference counts */
+	block_count_t reference_count_blocks;
+	/* number of blocks for the slab journal */
+	block_count_t slab_journal_blocks;
+	/*
+	 * Number of blocks after which the slab journal starts pushing out a reference_block for
+	 * each new entry it receives.
+	 */
+	block_count_t slab_journal_flushing_threshold;
+	/*
+	 * Number of blocks after which the slab journal pushes out all reference_blocks and makes
+	 * all vios wait.
+	 */
+	block_count_t slab_journal_blocking_threshold;
+	/* Number of blocks after which the slab must be scrubbed before coming online. */
+	block_count_t slab_journal_scrubbing_threshold;
+} __packed;
+
+/*
+ * This structure is memcmp'd for equality. Keep it packed and don't add any fields that are not
+ * properly set in both extant and parsed configs.
+ */
+struct thread_count_config {
+	unsigned int bio_ack_threads;
+	unsigned int bio_threads;
+	unsigned int bio_rotation_interval;
+	unsigned int cpu_threads;
+	unsigned int logical_zones;
+	unsigned int physical_zones;
+	unsigned int hash_zones;
+} __packed;
+
+struct device_config {
+	struct dm_target *owning_target;
+	struct dm_dev *owned_device;
+	struct vdo *vdo;
+	/* All configs referencing a layer are kept on a list in the layer */
+	struct list_head config_list;
+	char *original_string;
+	unsigned int version;
+	char *parent_device_name;
+	block_count_t physical_blocks;
+	/*
+	 * This is the number of logical blocks from VDO's internal point of view. It is the number
+	 * of 4K blocks regardless of the value of the logical_block_size parameter below.
+	 */
+	block_count_t logical_blocks;
+	unsigned int logical_block_size;
+	unsigned int cache_size;
+	unsigned int block_map_maximum_age;
+	bool deduplication;
+	bool compression;
+	struct thread_count_config thread_counts;
+	block_count_t max_discard_blocks;
+};
+
+enum vdo_completion_type {
+	/* Keep VDO_UNSET_COMPLETION_TYPE at the top. */
+	VDO_UNSET_COMPLETION_TYPE,
+	VDO_ACTION_COMPLETION,
+	VDO_ADMIN_COMPLETION,
+	VDO_BLOCK_ALLOCATOR_COMPLETION,
+	VDO_DATA_VIO_POOL_COMPLETION,
+	VDO_DECREMENT_COMPLETION,
+	VDO_FLUSH_COMPLETION,
+	VDO_FLUSH_NOTIFICATION_COMPLETION,
+	VDO_GENERATION_FLUSHED_COMPLETION,
+	VDO_HASH_ZONE_COMPLETION,
+	VDO_HASH_ZONES_COMPLETION,
+	VDO_LOCK_COUNTER_COMPLETION,
+	VDO_PAGE_COMPLETION,
+	VDO_READ_ONLY_MODE_COMPLETION,
+	VDO_REPAIR_COMPLETION,
+	VDO_SYNC_COMPLETION,
+	VIO_COMPLETION,
+} __packed;
+
+struct vdo_completion;
+
+/**
+ * typedef vdo_action - An asynchronous VDO operation.
+ * @completion: The completion of the operation.
+ */
+typedef void vdo_action(struct vdo_completion *completion);
+
+enum vdo_completion_priority {
+	BIO_ACK_Q_ACK_PRIORITY = 0,
+	BIO_ACK_Q_MAX_PRIORITY = 0,
+	BIO_Q_COMPRESSED_DATA_PRIORITY = 0,
+	BIO_Q_DATA_PRIORITY = 0,
+	BIO_Q_FLUSH_PRIORITY = 2,
+	BIO_Q_HIGH_PRIORITY = 2,
+	BIO_Q_METADATA_PRIORITY = 1,
+	BIO_Q_VERIFY_PRIORITY = 1,
+	BIO_Q_MAX_PRIORITY = 2,
+	CPU_Q_COMPLETE_VIO_PRIORITY = 0,
+	CPU_Q_COMPLETE_READ_PRIORITY = 0,
+	CPU_Q_COMPRESS_BLOCK_PRIORITY = 0,
+	CPU_Q_EVENT_REPORTER_PRIORITY = 0,
+	CPU_Q_HASH_BLOCK_PRIORITY = 0,
+	CPU_Q_MAX_PRIORITY = 0,
+	UDS_Q_PRIORITY = 0,
+	UDS_Q_MAX_PRIORITY = 0,
+	VDO_DEFAULT_Q_COMPLETION_PRIORITY = 1,
+	VDO_DEFAULT_Q_FLUSH_PRIORITY = 2,
+	VDO_DEFAULT_Q_MAP_BIO_PRIORITY = 0,
+	VDO_DEFAULT_Q_SYNC_PRIORITY = 2,
+	VDO_DEFAULT_Q_VIO_CALLBACK_PRIORITY = 1,
+	VDO_DEFAULT_Q_MAX_PRIORITY = 2,
+	/* The maximum allowable priority */
+	VDO_WORK_Q_MAX_PRIORITY = 2,
+	/* A value which must be out of range for a valid priority */
+	VDO_WORK_Q_DEFAULT_PRIORITY = VDO_WORK_Q_MAX_PRIORITY + 1,
+};
+
+struct vdo_completion {
+	/* The type of completion this is */
+	enum vdo_completion_type type;
+
+	/*
+	 * <code>true</code> once the processing of the operation is complete. This flag should not
+	 * be used by waiters external to the VDO base as it is used to gate calling the callback.
+	 */
+	bool complete;
+
+	/*
+	 * If true, queue this completion on the next callback invocation, even if it is already
+	 * running on the correct thread.
+	 */
+	bool requeue;
+
+	/* The ID of the thread which should run the next callback */
+	thread_id_t callback_thread_id;
+
+	/* The result of the operation */
+	int result;
+
+	/* The VDO on which this completion operates */
+	struct vdo *vdo;
+
+	/* The callback which will be called once the operation is complete */
+	vdo_action *callback;
+
+	/* Callback which, if set, will be called if an error result is set */
+	vdo_action *error_handler;
+
+	/* The parent object, if any, that spawned this completion */
+	void *parent;
+
+	/* Entry link for lock-free work queue */
+	struct funnel_queue_entry work_queue_entry_link;
+	enum vdo_completion_priority priority;
+	struct vdo_work_queue *my_queue;
+	u64 enqueue_time;
+};
+
+struct block_allocator;
+struct data_vio;
+struct vdo;
+struct vdo_config;
+
+/* vio types for statistics and instrumentation. */
+enum vio_type {
+	VIO_TYPE_UNINITIALIZED = 0,
+	VIO_TYPE_DATA,
+	VIO_TYPE_BLOCK_ALLOCATOR,
+	VIO_TYPE_BLOCK_MAP,
+	VIO_TYPE_BLOCK_MAP_INTERIOR,
+	VIO_TYPE_GEOMETRY,
+	VIO_TYPE_PARTITION_COPY,
+	VIO_TYPE_RECOVERY_JOURNAL,
+	VIO_TYPE_SLAB_JOURNAL,
+	VIO_TYPE_SLAB_SUMMARY,
+	VIO_TYPE_SUPER_BLOCK,
+} __packed;
+
+/* Priority levels for asynchronous I/O operations performed on a vio. */
+enum vio_priority {
+	VIO_PRIORITY_LOW = 0,
+	VIO_PRIORITY_DATA = VIO_PRIORITY_LOW,
+	VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA,
+	VIO_PRIORITY_METADATA,
+	VIO_PRIORITY_HIGH,
+} __packed;
+
+/*
+ * A wrapper for a bio. All I/O to the storage below a vdo is conducted via vios.
+ */
+struct vio {
+	/* The completion for this vio */
+	struct vdo_completion completion;
+
+	/* The bio zone in which I/O should be processed */
+	zone_count_t bio_zone;
+
+	/* The queueing priority of the vio operation */
+	enum vio_priority priority;
+
+	/* The vio type is used for statistics and instrumentation. */
+	enum vio_type type;
+
+	/* The size of this vio in blocks */
+	unsigned int block_count;
+
+	/* The data being read or written. */
+	char *data;
+
+	/* The VDO-owned bio to use for all IO for this vio */
+	struct bio *bio;
+
+	/*
+	 * A list of enqueued bios with consecutive block numbers, stored by vdo_submit_bio() under
+	 * the first-enqueued vio. The other vios are found via their bio entries in this list, and
+	 * are not added to the work queue as separate completions.
+	 */
+	struct bio_list bios_merged;
+};
+
+#endif /* VDO_TYPES_H */
diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c
new file mode 100644
index 00000000000..c85d349a5fb
--- /dev/null
+++ b/drivers/md/dm-vdo/vdo.c
@@ -0,0 +1,1846 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+/*
+ * This file contains the main entry points for normal operations on a vdo as well as functions for
+ * constructing and destroying vdo instances (in memory).
+ */
+
+/**
+ * DOC:
+ *
+ * A read_only_notifier has a single completion which is used to perform read-only notifications;
+ * however, vdo_enter_read_only_mode() may be called from any thread. A pair of fields, protected
+ * by a spinlock, are used to control the read-only mode entry process. The first field holds the
+ * read-only error. The second is the state field, which may hold any of the four special values
+ * enumerated here.
+ *
+ * When vdo_enter_read_only_mode() is called from some vdo thread, if the read_only_error field
+ * already contains an error (i.e. its value is not VDO_SUCCESS), then some other error has already
+ * initiated the read-only process, and nothing more is done. Otherwise, the new error is stored in
+ * the read_only_error field, and the state field is consulted. If the state is MAY_NOTIFY, it is
+ * set to NOTIFYING, and the notification process begins. If the state is MAY_NOT_NOTIFY, then
+ * notifications are currently disallowed, generally due to the vdo being suspended. In this case,
+ * nothing more will be done until the vdo is resumed, at which point the notification will be
+ * performed. In any other case, the vdo is already read-only, and there is nothing more to do.
+ */
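+
+/*
+ * A minimal sketch of the entry logic described above (the field and state names follow the
+ * prose and are illustrative, not quoted from the implementation):
+ *
+ *	spin_lock(&notifier->lock);
+ *	if (notifier->read_only_error == VDO_SUCCESS) {
+ *		notifier->read_only_error = error;
+ *		if (notifier->state == MAY_NOTIFY) {
+ *			notifier->state = NOTIFYING;
+ *			start_notifying = true;
+ *		}
+ *	}
+ *	spin_unlock(&notifier->lock);
+ */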
+
+#include "vdo.h"
+
+#include <linux/completion.h>
+#include <linux/device-mapper.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "block-map.h"
+#include "completion.h"
+#include "data-vio.h"
+#include "dedupe.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "logical-zone.h"
+#include "packer.h"
+#include "physical-zone.h"
+#include "pool-sysfs.h"
+#include "recovery-journal.h"
+#include "release-versions.h"
+#include "slab-depot.h"
+#include "statistics.h"
+#include "status-codes.h"
+#include "vio.h"
+#include "work-queue.h"
+
+
+enum { PARANOID_THREAD_CONSISTENCY_CHECKS = 0 };
+
+struct sync_completion {
+	struct vdo_completion vdo_completion;
+	struct completion completion;
+};
+
+/*
+ * We don't expect this set to ever get really large, so a linked list is adequate. We can use a
+ * pointer_map if we need to later.
+ */
+struct device_registry {
+	struct list_head links;
+	/* TODO: Convert to rcu per kernel recommendation. */
+	rwlock_t lock;
+};
+
+static struct device_registry registry;
+
+/**
+ * vdo_initialize_device_registry_once() - Initialize the necessary structures for the device
+ *                                         registry.
+ */
+void vdo_initialize_device_registry_once(void)
+{
+	INIT_LIST_HEAD(&registry.links);
+	rwlock_init(&registry.lock);
+}
+
+/** vdo_is_equal() - Implements vdo_filter_t. */
+static bool vdo_is_equal(struct vdo *vdo, const void *context)
+{
+	return ((void *) vdo == context);
+}
+
+/**
+ * filter_vdos_locked() - Find the first vdo in the registry which matches a filter, if any.
+ * @filter: The filter function to apply to vdos.
+ * @context: A bit of context to provide the filter.
+ *
+ * Context: Must be called holding the registry lock.
+ *
+ * Return: The first matching vdo, or NULL if none matches.
+ */
+static struct vdo * __must_check filter_vdos_locked(vdo_filter_t *filter, const void *context)
+{
+	struct vdo *vdo;
+
+	list_for_each_entry(vdo, &registry.links, registration)
+		if (filter(vdo, context))
+			return vdo;
+
+	return NULL;
+}
+
+/**
+ * vdo_find_matching() - Find and return the first (if any) vdo matching a given filter function.
+ * @filter: The filter function to apply to vdos.
+ * @context: A bit of context to provide the filter.
+ */
+struct vdo *vdo_find_matching(vdo_filter_t *filter, const void *context)
+{
+	struct vdo *vdo;
+
+	read_lock(&registry.lock);
+	vdo = filter_vdos_locked(filter, context);
+	read_unlock(&registry.lock);
+	return vdo;
+}
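+
+/*
+ * As an illustration, a hypothetical filter (not one defined in this patch) matching a vdo by its
+ * instance number could be used like this:
+ *
+ *	static bool instance_matches(struct vdo *vdo, const void *context)
+ *	{
+ *		return (vdo->instance == *((const unsigned int *) context));
+ *	}
+ *
+ *	unsigned int instance = 0;
+ *	struct vdo *found = vdo_find_matching(instance_matches, &instance);
+ */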
+
+static void start_vdo_request_queue(void *ptr)
+{
+	struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue());
+
+	uds_register_allocating_thread(&thread->allocating_thread,
+				       &thread->vdo->allocations_allowed);
+}
+
+static void finish_vdo_request_queue(void *ptr)
+{
+	uds_unregister_allocating_thread();
+}
+
+#ifdef MODULE
+#define MODULE_NAME THIS_MODULE->name
+#else
+#define MODULE_NAME "dm-vdo"
+#endif  /* MODULE */
+
+static const struct vdo_work_queue_type default_queue_type = {
+	.start = start_vdo_request_queue,
+	.finish = finish_vdo_request_queue,
+	.max_priority = VDO_DEFAULT_Q_MAX_PRIORITY,
+	.default_priority = VDO_DEFAULT_Q_COMPLETION_PRIORITY,
+};
+
+static const struct vdo_work_queue_type bio_ack_q_type = {
+	.start = NULL,
+	.finish = NULL,
+	.max_priority = BIO_ACK_Q_MAX_PRIORITY,
+	.default_priority = BIO_ACK_Q_ACK_PRIORITY,
+};
+
+static const struct vdo_work_queue_type cpu_q_type = {
+	.start = NULL,
+	.finish = NULL,
+	.max_priority = CPU_Q_MAX_PRIORITY,
+	.default_priority = CPU_Q_MAX_PRIORITY,
+};
+
+static void uninitialize_thread_config(struct thread_config *config)
+{
+	UDS_FREE(UDS_FORGET(config->logical_threads));
+	UDS_FREE(UDS_FORGET(config->physical_threads));
+	UDS_FREE(UDS_FORGET(config->hash_zone_threads));
+	UDS_FREE(UDS_FORGET(config->bio_threads));
+	memset(config, 0, sizeof(struct thread_config));
+}
+
+static void
+assign_thread_ids(struct thread_config *config, thread_id_t thread_ids[], zone_count_t count)
+{
+	zone_count_t zone;
+
+	for (zone = 0; zone < count; zone++)
+		thread_ids[zone] = config->thread_count++;
+}
+
+/**
+ * initialize_thread_config() - Initialize the thread mapping
+ *
+ * If the logical, physical, and hash zone counts are all 0, a single thread will be shared by all
+ * three plus the packer and recovery journal. Otherwise, there must be at least one of each type,
+ * and each will have its own thread, as will the packer and recovery journal.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int __must_check
+initialize_thread_config(struct thread_count_config counts, struct thread_config *config)
+{
+	int result;
+	bool single = ((counts.logical_zones + counts.physical_zones + counts.hash_zones) == 0);
+
+	config->bio_thread_count = counts.bio_threads;
+	if (single) {
+		config->logical_zone_count = 1;
+		config->physical_zone_count = 1;
+		config->hash_zone_count = 1;
+	} else {
+		config->logical_zone_count = counts.logical_zones;
+		config->physical_zone_count = counts.physical_zones;
+		config->hash_zone_count = counts.hash_zones;
+	}
+
+	result = UDS_ALLOCATE(config->logical_zone_count,
+			      thread_id_t,
+			      "logical thread array",
+			      &config->logical_threads);
+	if (result != VDO_SUCCESS) {
+		uninitialize_thread_config(config);
+		return result;
+	}
+
+	result = UDS_ALLOCATE(config->physical_zone_count,
+			      thread_id_t,
+			      "physical thread array",
+			      &config->physical_threads);
+	if (result != VDO_SUCCESS) {
+		uninitialize_thread_config(config);
+		return result;
+	}
+
+	result = UDS_ALLOCATE(config->hash_zone_count,
+			      thread_id_t,
+			      "hash thread array",
+			      &config->hash_zone_threads);
+	if (result != VDO_SUCCESS) {
+		uninitialize_thread_config(config);
+		return result;
+	}
+
+	result = UDS_ALLOCATE(config->bio_thread_count,
+			      thread_id_t,
+			      "bio thread array",
+			      &config->bio_threads);
+	if (result != VDO_SUCCESS) {
+		uninitialize_thread_config(config);
+		return result;
+	}
+
+	if (single) {
+		config->logical_threads[0] = config->thread_count;
+		config->physical_threads[0] = config->thread_count;
+		config->hash_zone_threads[0] = config->thread_count++;
+	} else {
+		config->admin_thread = config->thread_count;
+		config->journal_thread = config->thread_count++;
+		config->packer_thread = config->thread_count++;
+		assign_thread_ids(config, config->logical_threads, counts.logical_zones);
+		assign_thread_ids(config, config->physical_threads, counts.physical_zones);
+		assign_thread_ids(config, config->hash_zone_threads, counts.hash_zones);
+	}
+
+	config->dedupe_thread = config->thread_count++;
+	config->bio_ack_thread =
+		((counts.bio_ack_threads > 0) ? config->thread_count++ : VDO_INVALID_THREAD_ID);
+	config->cpu_thread = config->thread_count++;
+	assign_thread_ids(config, config->bio_threads, counts.bio_threads);
+	return VDO_SUCCESS;
+}
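+
+/*
+ * As a worked example, assuming a configuration with one logical, one physical, and one hash
+ * zone, one bio ack thread, and two bio threads, the ids assigned above would be:
+ *
+ *	admin = journal = 0, packer = 1, logical[0] = 2, physical[0] = 3, hash[0] = 4,
+ *	dedupe = 5, bio_ack = 6, cpu = 7, bio[0] = 8, bio[1] = 9; thread_count = 10
+ */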
+
+/**
+ * read_geometry_block() - Synchronously read the geometry block from a vdo's underlying block
+ *                         device.
+ * @vdo: The vdo whose geometry is to be read.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check read_geometry_block(struct vdo *vdo)
+{
+	struct vio *vio;
+	char *block;
+	int result;
+
+	result = UDS_ALLOCATE(VDO_BLOCK_SIZE, u8, __func__, &block);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = create_metadata_vio(vdo, VIO_TYPE_GEOMETRY, VIO_PRIORITY_HIGH, NULL, block, &vio);
+	if (result != VDO_SUCCESS) {
+		UDS_FREE(block);
+		return result;
+	}
+
+	/*
+	 * This is only safe because, since the geometry has not yet been loaded, the bio_offset
+	 * field of the vdo's geometry is 0, so the fact that vio_reset_bio() will subtract that
+	 * offset from the supplied pbn is not a problem.
+	 */
+	result = vio_reset_bio(vio, block, NULL, REQ_OP_READ, VDO_GEOMETRY_BLOCK_LOCATION);
+	if (result != VDO_SUCCESS) {
+		free_vio(UDS_FORGET(vio));
+		UDS_FREE(block);
+		return result;
+	}
+
+	bio_set_dev(vio->bio, vdo_get_backing_device(vdo));
+	submit_bio_wait(vio->bio);
+	result = blk_status_to_errno(vio->bio->bi_status);
+	free_vio(UDS_FORGET(vio));
+	if (result != 0) {
+		uds_log_error_strerror(result, "synchronous read failed");
+		UDS_FREE(block);
+		return -EIO;
+	}
+
+	result = vdo_parse_geometry_block((u8 *) block, &vdo->geometry);
+	UDS_FREE(block);
+	return result;
+}
+
+static bool get_zone_thread_name(const thread_id_t thread_ids[],
+				 zone_count_t count,
+				 thread_id_t id,
+				 const char *prefix,
+				 char *buffer,
+				 size_t buffer_length)
+{
+	if (id >= thread_ids[0]) {
+		thread_id_t index = id - thread_ids[0];
+
+		if (index < count) {
+			snprintf(buffer, buffer_length, "%s%d", prefix, index);
+			return true;
+		}
+	}
+	return false;
+}
+
+/**
+ * get_thread_name() - Format the name of the worker thread desired to support a given work queue.
+ * @thread_config: The thread configuration.
+ * @thread_id: The thread id.
+ * @buffer: Where to put the formatted name.
+ * @buffer_length: Size of the output buffer.
+ *
+ * The physical layer may add a prefix identifying the product; the output from this function
+ * should just identify the thread.
+ */
+static void
+get_thread_name(const struct thread_config *thread_config,
+		thread_id_t thread_id,
+		char *buffer,
+		size_t buffer_length)
+{
+	if (thread_id == thread_config->journal_thread) {
+		if (thread_config->packer_thread == thread_id) {
+			/*
+			 * This is the "single thread" config where one thread is used for the
+			 * journal, packer, logical, physical, and hash zones. In that case, it is
+			 * known as the "request queue."
+			 */
+			snprintf(buffer, buffer_length, "reqQ");
+			return;
+		}
+
+		snprintf(buffer, buffer_length, "journalQ");
+		return;
+	} else if (thread_id == thread_config->admin_thread) {
+		/* Theoretically this could be different from the journal thread. */
+		snprintf(buffer, buffer_length, "adminQ");
+		return;
+	} else if (thread_id == thread_config->packer_thread) {
+		snprintf(buffer, buffer_length, "packerQ");
+		return;
+	} else if (thread_id == thread_config->dedupe_thread) {
+		snprintf(buffer, buffer_length, "dedupeQ");
+		return;
+	} else if (thread_id == thread_config->bio_ack_thread) {
+		snprintf(buffer, buffer_length, "ackQ");
+		return;
+	} else if (thread_id == thread_config->cpu_thread) {
+		snprintf(buffer, buffer_length, "cpuQ");
+		return;
+	}
+
+	if (get_zone_thread_name(thread_config->logical_threads,
+				 thread_config->logical_zone_count,
+				 thread_id,
+				 "logQ",
+				 buffer,
+				 buffer_length))
+		return;
+
+	if (get_zone_thread_name(thread_config->physical_threads,
+				 thread_config->physical_zone_count,
+				 thread_id,
+				 "physQ",
+				 buffer,
+				 buffer_length))
+		return;
+
+	if (get_zone_thread_name(thread_config->hash_zone_threads,
+				 thread_config->hash_zone_count,
+				 thread_id,
+				 "hashQ",
+				 buffer,
+				 buffer_length))
+		return;
+
+	if (get_zone_thread_name(thread_config->bio_threads,
+				 thread_config->bio_thread_count,
+				 thread_id,
+				 "bioQ",
+				 buffer,
+				 buffer_length))
+		return;
+
+	/* Some sort of misconfiguration? */
+	snprintf(buffer, buffer_length, "reqQ%d", thread_id);
+}
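+
+/*
+ * Continuing the worked example above, this would produce "journalQ" for thread 0, "packerQ" for
+ * thread 1, "logQ0" for thread 2, "physQ0" for thread 3, "hashQ0" for thread 4, "dedupeQ" for
+ * thread 5, "ackQ" for thread 6, "cpuQ" for thread 7, and "bioQ0" and "bioQ1" for threads 8 and 9.
+ * (Thread 0 matches the journal check first, so "adminQ" would only appear if the admin thread
+ * were ever distinct from the journal thread.)
+ */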
+
+/**
+ * vdo_make_thread() - Construct a single vdo work_queue and its associated thread (or threads for
+ *                     round-robin queues).
+ * @vdo: The vdo which owns the thread.
+ * @thread_id: The id of the thread to create (as determined by the thread_config).
+ * @type: The description of the work queue for this thread.
+ * @queue_count: The number of actual threads/queues contained in the "thread".
+ * @contexts: An array of queue_count contexts, one for each individual queue; may be NULL.
+ *
+ * Each "thread" constructed by this method is represented by a unique thread id in the thread
+ * config, and completions can be enqueued to the queue and run on the threads comprising this
+ * entity.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_make_thread(struct vdo *vdo,
+		    thread_id_t thread_id,
+		    const struct vdo_work_queue_type *type,
+		    unsigned int queue_count,
+		    void *contexts[])
+{
+	struct vdo_thread *thread = &vdo->threads[thread_id];
+	char queue_name[MAX_VDO_WORK_QUEUE_NAME_LEN];
+
+	if (type == NULL)
+		type = &default_queue_type;
+
+	if (thread->queue != NULL)
+		return ASSERT(vdo_work_queue_type_is(thread->queue, type),
+			      "already constructed vdo thread %u is of the correct type",
+			      thread_id);
+
+	thread->vdo = vdo;
+	thread->thread_id = thread_id;
+	get_thread_name(&vdo->thread_config, thread_id, queue_name, sizeof(queue_name));
+	return vdo_make_work_queue(vdo->thread_name_prefix,
+				   queue_name,
+				   thread,
+				   type,
+				   queue_count,
+				   contexts,
+				   &thread->queue);
+}
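+
+/*
+ * For example, vdo_make() below constructs the cpu "thread" as a round-robin queue with one
+ * worker per configured cpu thread, giving each worker its own LZ4 context:
+ *
+ *	result = vdo_make_thread(vdo, vdo->thread_config.cpu_thread, &cpu_q_type,
+ *				 config->thread_counts.cpu_threads,
+ *				 (void **) vdo->compression_context);
+ */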
+
+/**
+ * register_vdo() - Register a VDO; it must not already be registered.
+ * @vdo: The vdo to register.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int register_vdo(struct vdo *vdo)
+{
+	int result;
+
+	write_lock(&registry.lock);
+	result = ASSERT(filter_vdos_locked(vdo_is_equal, vdo) == NULL,
+			"VDO not already registered");
+	if (result == VDO_SUCCESS) {
+		INIT_LIST_HEAD(&vdo->registration);
+		list_add_tail(&vdo->registration, &registry.links);
+	}
+	write_unlock(&registry.lock);
+
+	return result;
+}
+
+/**
+ * initialize_vdo() - Do the portion of initializing a vdo which will clean up after itself on
+ *                    error.
+ * @vdo: The vdo being initialized
+ * @config: The configuration of the vdo
+ * @instance: The instance number of the vdo
+ * @reason: The buffer to hold the failure reason on error
+ */
+static int
+initialize_vdo(struct vdo *vdo, struct device_config *config, unsigned int instance, char **reason)
+{
+	int result;
+	zone_count_t i;
+
+	vdo->device_config = config;
+	vdo->starting_sector_offset = config->owning_target->begin;
+	vdo->instance = instance;
+	vdo->allocations_allowed = true;
+	vdo_set_admin_state_code(&vdo->admin.state, VDO_ADMIN_STATE_NEW);
+	INIT_LIST_HEAD(&vdo->device_config_list);
+	vdo_initialize_completion(&vdo->admin.completion, vdo, VDO_ADMIN_COMPLETION);
+	init_completion(&vdo->admin.callback_sync);
+	mutex_init(&vdo->stats_mutex);
+	result = read_geometry_block(vdo);
+	if (result != VDO_SUCCESS) {
+		*reason = "Could not load geometry block";
+		return result;
+	}
+
+	result = initialize_thread_config(config->thread_counts, &vdo->thread_config);
+	if (result != VDO_SUCCESS) {
+		*reason = "Cannot create thread configuration";
+		return result;
+	}
+
+	uds_log_info("zones: %d logical, %d physical, %d hash; total threads: %d",
+		     config->thread_counts.logical_zones,
+		     config->thread_counts.physical_zones,
+		     config->thread_counts.hash_zones,
+		     vdo->thread_config.thread_count);
+
+	/* Compression context storage */
+	result = UDS_ALLOCATE(config->thread_counts.cpu_threads,
+			      char *,
+			      "LZ4 context",
+			      &vdo->compression_context);
+	if (result != VDO_SUCCESS) {
+		*reason = "cannot allocate LZ4 context";
+		return result;
+	}
+
+	for (i = 0; i < config->thread_counts.cpu_threads; i++) {
+		result = UDS_ALLOCATE(LZ4_MEM_COMPRESS,
+				      char,
+				      "LZ4 context",
+				      &vdo->compression_context[i]);
+		if (result != VDO_SUCCESS) {
+			*reason = "cannot allocate LZ4 context";
+			return result;
+		}
+	}
+
+	result = register_vdo(vdo);
+	if (result != VDO_SUCCESS) {
+		*reason = "Cannot add VDO to device registry";
+		return result;
+	}
+
+	vdo_set_admin_state_code(&vdo->admin.state, VDO_ADMIN_STATE_INITIALIZED);
+	return result;
+}
+
+/**
+ * vdo_make() - Allocate and initialize a vdo.
+ * @instance: Device instantiation counter.
+ * @config: The device configuration.
+ * @reason: The reason for any failure during this call.
+ * @vdo_ptr: A pointer to hold the created vdo.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_make(unsigned int instance,
+	     struct device_config *config,
+	     char **reason,
+	     struct vdo **vdo_ptr)
+{
+	int result;
+	struct vdo *vdo;
+
+	/* VDO-3769 - Set a generic reason so we don't ever return garbage. */
+	*reason = "Unspecified error";
+
+	result = UDS_ALLOCATE(1, struct vdo, __func__, &vdo);
+	if (result != UDS_SUCCESS) {
+		*reason = "Cannot allocate VDO";
+		return result;
+	}
+
+	result = initialize_vdo(vdo, config, instance, reason);
+	if (result != VDO_SUCCESS) {
+		vdo_destroy(vdo);
+		return result;
+	}
+
+	/* From here on, the caller will clean up if there is an error. */
+	*vdo_ptr = vdo;
+
+	snprintf(vdo->thread_name_prefix,
+		 sizeof(vdo->thread_name_prefix),
+		 "%s%u",
+		 MODULE_NAME,
+		 instance);
+	BUG_ON(vdo->thread_name_prefix[0] == '\0');
+	result = UDS_ALLOCATE(vdo->thread_config.thread_count,
+			      struct vdo_thread,
+			      __func__,
+			      &vdo->threads);
+	if (result != VDO_SUCCESS) {
+		*reason = "Cannot allocate thread structures";
+		return result;
+	}
+
+	result = vdo_make_thread(vdo,
+				 vdo->thread_config.admin_thread,
+				 &default_queue_type,
+				 1,
+				 NULL);
+	if (result != VDO_SUCCESS) {
+		*reason = "Cannot make admin thread";
+		return result;
+	}
+
+	result = vdo_make_flusher(vdo);
+	if (result != VDO_SUCCESS) {
+		*reason = "Cannot make flusher zones";
+		return result;
+	}
+
+	result = vdo_make_packer(vdo, DEFAULT_PACKER_BINS, &vdo->packer);
+	if (result != VDO_SUCCESS) {
+		*reason = "Cannot make packer zones";
+		return result;
+	}
+
+	BUG_ON(vdo->device_config->logical_block_size <= 0);
+	BUG_ON(vdo->device_config->owned_device == NULL);
+	result = make_data_vio_pool(vdo,
+				    MAXIMUM_VDO_USER_VIOS,
+				    MAXIMUM_VDO_USER_VIOS * 3 / 4,
+				    &vdo->data_vio_pool);
+	if (result != VDO_SUCCESS) {
+		*reason = "Cannot allocate data_vio pool";
+		return result;
+	}
+
+	result = vdo_make_io_submitter(config->thread_counts.bio_threads,
+				       config->thread_counts.bio_rotation_interval,
+				       get_data_vio_pool_request_limit(vdo->data_vio_pool),
+				       vdo,
+				       &vdo->io_submitter);
+	if (result != VDO_SUCCESS) {
+		*reason = "bio submission initialization failed";
+		return result;
+	}
+
+	if (vdo_uses_bio_ack_queue(vdo)) {
+		result = vdo_make_thread(vdo,
+					 vdo->thread_config.bio_ack_thread,
+					 &bio_ack_q_type,
+					 config->thread_counts.bio_ack_threads,
+					 NULL);
+		if (result != VDO_SUCCESS) {
+			*reason = "bio ack queue initialization failed";
+			return result;
+		}
+	}
+
+	result = vdo_make_thread(vdo,
+				 vdo->thread_config.cpu_thread,
+				 &cpu_q_type,
+				 config->thread_counts.cpu_threads,
+				 (void **) vdo->compression_context);
+	if (result != VDO_SUCCESS) {
+		*reason = "CPU queue initialization failed";
+		return result;
+	}
+
+	return VDO_SUCCESS;
+}
+
+static void finish_vdo(struct vdo *vdo)
+{
+	int i;
+
+	if (vdo->threads == NULL)
+		return;
+
+	vdo_cleanup_io_submitter(vdo->io_submitter);
+	vdo_finish_dedupe_index(vdo->hash_zones);
+
+	for (i = 0; i < vdo->thread_config.thread_count; i++)
+		vdo_finish_work_queue(vdo->threads[i].queue);
+}
+
+/**
+ * free_listeners() - Free the list of read-only listeners associated with a thread.
+ * @thread: The thread holding the list to free.
+ */
+static void free_listeners(struct vdo_thread *thread)
+{
+	struct read_only_listener *listener, *next;
+
+	for (listener = UDS_FORGET(thread->listeners); listener != NULL; listener = next) {
+		next = UDS_FORGET(listener->next);
+		UDS_FREE(listener);
+	}
+}
+
+static void uninitialize_super_block(struct vdo_super_block *super_block)
+{
+	free_vio_components(&super_block->vio);
+	UDS_FREE(super_block->buffer);
+}
+
+/**
+ * unregister_vdo() - Remove a vdo from the device registry.
+ * @vdo: The vdo to remove.
+ */
+static void unregister_vdo(struct vdo *vdo)
+{
+	write_lock(&registry.lock);
+	if (filter_vdos_locked(vdo_is_equal, vdo) == vdo)
+		list_del_init(&vdo->registration);
+
+	write_unlock(&registry.lock);
+}
+
+/**
+ * vdo_destroy() - Destroy a vdo instance.
+ * @vdo: The vdo to destroy (may be NULL).
+ */
+void vdo_destroy(struct vdo *vdo)
+{
+	unsigned int i;
+
+	if (vdo == NULL)
+		return;
+
+	/* A running VDO should never be destroyed without suspending first. */
+	BUG_ON(vdo_get_admin_state(vdo)->normal);
+
+	vdo->allocations_allowed = true;
+
+	/* Stop services that need to gather VDO statistics from the worker threads. */
+	if (vdo->sysfs_added) {
+		init_completion(&vdo->stats_shutdown);
+		kobject_put(&vdo->stats_directory);
+		wait_for_completion(&vdo->stats_shutdown);
+	}
+
+	finish_vdo(vdo);
+	unregister_vdo(vdo);
+	free_data_vio_pool(vdo->data_vio_pool);
+	vdo_free_io_submitter(UDS_FORGET(vdo->io_submitter));
+	vdo_free_flusher(UDS_FORGET(vdo->flusher));
+	vdo_free_packer(UDS_FORGET(vdo->packer));
+	vdo_free_recovery_journal(UDS_FORGET(vdo->recovery_journal));
+	vdo_free_slab_depot(UDS_FORGET(vdo->depot));
+	vdo_uninitialize_layout(&vdo->layout);
+	vdo_uninitialize_layout(&vdo->next_layout);
+	if (vdo->partition_copier)
+		dm_kcopyd_client_destroy(UDS_FORGET(vdo->partition_copier));
+	uninitialize_super_block(&vdo->super_block);
+	vdo_free_block_map(UDS_FORGET(vdo->block_map));
+	vdo_free_hash_zones(UDS_FORGET(vdo->hash_zones));
+	vdo_free_physical_zones(UDS_FORGET(vdo->physical_zones));
+	vdo_free_logical_zones(UDS_FORGET(vdo->logical_zones));
+
+	if (vdo->threads != NULL) {
+		for (i = 0; i < vdo->thread_config.thread_count; i++) {
+			free_listeners(&vdo->threads[i]);
+			vdo_free_work_queue(UDS_FORGET(vdo->threads[i].queue));
+		}
+		UDS_FREE(UDS_FORGET(vdo->threads));
+	}
+
+	uninitialize_thread_config(&vdo->thread_config);
+
+	if (vdo->compression_context != NULL) {
+		for (i = 0; i < vdo->device_config->thread_counts.cpu_threads; i++)
+			UDS_FREE(UDS_FORGET(vdo->compression_context[i]));
+
+		UDS_FREE(UDS_FORGET(vdo->compression_context));
+	}
+
+	/*
+	 * The call to kobject_put on the kobj sysfs node will decrement its reference count; when
+	 * the count goes to zero the VDO object will be freed as a side effect.
+	 */
+	if (!vdo->sysfs_added)
+		UDS_FREE(vdo);
+	else
+		kobject_put(&vdo->vdo_directory);
+}
+
+static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block)
+{
+	int result;
+
+	result = UDS_ALLOCATE(VDO_BLOCK_SIZE,
+			      char,
+			      "encoded super block",
+			      (char **) &vdo->super_block.buffer);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	return allocate_vio_components(vdo,
+				       VIO_TYPE_SUPER_BLOCK,
+				       VIO_PRIORITY_METADATA,
+				       NULL,
+				       1,
+				       (char *) super_block->buffer,
+				       &vdo->super_block.vio);
+}
+
+/**
+ * finish_reading_super_block() - Continue after loading the super block.
+ * @completion: The super block vio.
+ *
+ * This callback is registered in vdo_load_super_block().
+ */
+static void finish_reading_super_block(struct vdo_completion *completion)
+{
+	struct vdo_super_block *super_block =
+		container_of(as_vio(completion), struct vdo_super_block, vio);
+
+	vdo_continue_completion(UDS_FORGET(completion->parent),
+				vdo_decode_super_block(super_block->buffer));
+}
+
+/**
+ * handle_super_block_read_error() - Handle an error reading the super block.
+ * @completion: The super block vio.
+ *
+ * This error handler is registered in vdo_load_super_block().
+ */
+static void handle_super_block_read_error(struct vdo_completion *completion)
+{
+	vio_record_metadata_io_error(as_vio(completion));
+	finish_reading_super_block(completion);
+}
+
+static void read_super_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct vdo_completion *parent = vio->completion.parent;
+
+	continue_vio_after_io(vio, finish_reading_super_block, parent->callback_thread_id);
+}
+
+/**
+ * vdo_load_super_block() - Allocate a super block and read its contents from storage.
+ * @vdo: The vdo containing the super block on disk.
+ * @parent: The completion to notify after loading the super block.
+ */
+void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent)
+{
+	int result;
+
+	result = initialize_super_block(vdo, &vdo->super_block);
+	if (result != VDO_SUCCESS) {
+		vdo_continue_completion(parent, result);
+		return;
+	}
+
+	vdo->super_block.vio.completion.parent = parent;
+	submit_metadata_vio(&vdo->super_block.vio,
+			    vdo_get_data_region_start(vdo->geometry),
+			    read_super_block_endio,
+			    handle_super_block_read_error,
+			    REQ_OP_READ);
+}
+
+/**
+ * pool_stats_release() - Signal that sysfs stats have been shut down.
+ * @directory: The vdo stats directory.
+ */
+static void pool_stats_release(struct kobject *directory)
+{
+	struct vdo *vdo = container_of(directory, struct vdo, stats_directory);
+
+	complete(&vdo->stats_shutdown);
+}
+
+ATTRIBUTE_GROUPS(vdo_pool_stats);
+static struct kobj_type stats_directory_type = {
+	.release = pool_stats_release,
+	.sysfs_ops = &vdo_pool_stats_sysfs_ops,
+	.default_groups = vdo_pool_stats_groups,
+};
+
+/**
+ * vdo_add_sysfs_stats_dir() - Add the stats directory to the vdo sysfs directory.
+ * @vdo: The vdo.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_add_sysfs_stats_dir(struct vdo *vdo)
+{
+	int result;
+
+	kobject_init(&vdo->stats_directory, &stats_directory_type);
+	result = kobject_add(&vdo->stats_directory, &vdo->vdo_directory, "statistics");
+	if (result != 0)
+		return VDO_CANT_ADD_SYSFS_NODE;
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * vdo_get_backing_device() - Get the block device object underlying a vdo.
+ * @vdo: The vdo.
+ *
+ * Return: The vdo's current block device.
+ */
+struct block_device *vdo_get_backing_device(const struct vdo *vdo)
+{
+	return vdo->device_config->owned_device->bdev;
+}
+
+/**
+ * vdo_get_device_name() - Get the device name associated with the vdo target.
+ * @target: The target device interface.
+ *
+ * Return: The block device name.
+ */
+const char *vdo_get_device_name(const struct dm_target *target)
+{
+	return dm_device_name(dm_table_get_md(target->table));
+}
+
+/**
+ * vdo_synchronous_flush() - Issue a flush request and wait for it to complete.
+ * @vdo: The vdo.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_synchronous_flush(struct vdo *vdo)
+{
+	int result;
+	struct bio bio;
+
+	bio_init(&bio, vdo_get_backing_device(vdo), 0, 0,
+		 REQ_OP_WRITE | REQ_PREFLUSH);
+	submit_bio_wait(&bio);
+	result = blk_status_to_errno(bio.bi_status);
+
+	atomic64_inc(&vdo->stats.flush_out);
+	if (result != 0) {
+		uds_log_error_strerror(result, "synchronous flush failed");
+		result = -EIO;
+	}
+
+	bio_uninit(&bio);
+	return result;
+}
+
+/**
+ * vdo_get_state() - Get the current state of the vdo.
+ * @vdo: The vdo.
+ *
+ * Context: This method may be called from any thread.
+ *
+ * Return: The current state of the vdo.
+ */
+enum vdo_state vdo_get_state(const struct vdo *vdo)
+{
+	enum vdo_state state = atomic_read(&vdo->state);
+
+	/* pairs with barriers where state field is changed */
+	smp_rmb();
+	return state;
+}
+
+/**
+ * vdo_set_state() - Set the current state of the vdo.
+ * @vdo: The vdo whose state is to be set.
+ * @state: The new state of the vdo.
+ *
+ * Context: This method may be called from any thread.
+ */
+void vdo_set_state(struct vdo *vdo, enum vdo_state state)
+{
+	/* pairs with barrier in vdo_get_state */
+	smp_wmb();
+	atomic_set(&vdo->state, state);
+}
+
+/**
+ * vdo_get_admin_state() - Get the admin state of the vdo.
+ * @vdo: The vdo.
+ *
+ * Return: The code for the vdo's current admin state.
+ */
+const struct admin_state_code *vdo_get_admin_state(const struct vdo *vdo)
+{
+	return vdo_get_admin_state_code(&vdo->admin.state);
+}
+
+/**
+ * record_vdo() - Record the state of the VDO for encoding in the super block.
+ */
+static void record_vdo(struct vdo *vdo)
+{
+	vdo->states.release_version = vdo->geometry.release_version;
+	vdo->states.vdo.state = vdo_get_state(vdo);
+	vdo->states.block_map = vdo_record_block_map(vdo->block_map);
+	vdo->states.recovery_journal = vdo_record_recovery_journal(vdo->recovery_journal);
+	vdo->states.slab_depot = vdo_record_slab_depot(vdo->depot);
+	vdo->states.layout = vdo->layout;
+}
+
+/**
+ * continue_super_block_parent() - Continue the parent of a super block save operation.
+ * @completion: The super block vio.
+ *
+ * This callback is registered in vdo_save_components().
+ */
+static void continue_super_block_parent(struct vdo_completion *completion)
+{
+	vdo_continue_completion(UDS_FORGET(completion->parent), completion->result);
+}
+
+/**
+ * handle_save_error() - Log a super block save error.
+ * @completion: The super block vio.
+ *
+ * This error handler is registered in vdo_save_components().
+ */
+static void handle_save_error(struct vdo_completion *completion)
+{
+	struct vdo_super_block *super_block =
+		container_of(as_vio(completion), struct vdo_super_block, vio);
+
+	vio_record_metadata_io_error(&super_block->vio);
+	uds_log_error_strerror(completion->result, "super block save failed");
+	/*
+	 * Mark the super block as unwritable so that we won't attempt to write it again. This
+	 * avoids the case where a growth attempt fails writing the super block with the new size,
+	 * but the subsequent attempt to write out the read-only state succeeds. In this case,
+	 * writes which happened just before the suspend would not be visible if the VDO is
+	 * restarted without rebuilding, but, after a read-only rebuild, the effects of those
+	 * writes would reappear.
+	 */
+	super_block->unwriteable = true;
+	completion->callback(completion);
+}
+
+static void super_block_write_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct vdo_completion *parent = vio->completion.parent;
+
+	continue_vio_after_io(vio, continue_super_block_parent, parent->callback_thread_id);
+}
+
+/**
+ * vdo_save_components() - Encode the vdo and save the super block asynchronously.
+ * @vdo: The vdo whose state is being saved.
+ * @parent: The completion to notify when the save is complete.
+ */
+void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent)
+{
+	struct vdo_super_block *super_block = &vdo->super_block;
+
+	if (super_block->unwriteable) {
+		vdo_continue_completion(parent, VDO_READ_ONLY);
+		return;
+	}
+
+	if (super_block->vio.completion.parent != NULL) {
+		vdo_continue_completion(parent, VDO_COMPONENT_BUSY);
+		return;
+	}
+
+	record_vdo(vdo);
+
+	vdo_encode_super_block(super_block->buffer, &vdo->states);
+	super_block->vio.completion.parent = parent;
+	super_block->vio.completion.callback_thread_id = parent->callback_thread_id;
+	submit_metadata_vio(&super_block->vio,
+			    vdo_get_data_region_start(vdo->geometry),
+			    super_block_write_endio,
+			    handle_save_error,
+			    REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA);
+}
+
+/**
+ * vdo_register_read_only_listener() - Register a listener to be notified when the VDO goes
+ *                                     read-only.
+ * @vdo: The vdo to register with.
+ * @listener: The object to notify.
+ * @notification: The function to call to send the notification.
+ * @thread_id: The id of the thread on which to send the notification.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_register_read_only_listener(struct vdo *vdo,
+				    void *listener,
+				    vdo_read_only_notification *notification,
+				    thread_id_t thread_id)
+{
+	struct vdo_thread *thread = &vdo->threads[thread_id];
+	struct read_only_listener *read_only_listener;
+	int result;
+
+	result = ASSERT(thread_id != vdo->thread_config.dedupe_thread,
+			"read only listener not registered on dedupe thread");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(1, struct read_only_listener, __func__, &read_only_listener);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*read_only_listener = (struct read_only_listener) {
+		.listener = listener,
+		.notify = notification,
+		.next = thread->listeners,
+	};
+
+	thread->listeners = read_only_listener;
+	return VDO_SUCCESS;
+}
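+
+/*
+ * For example, vdo_enable_read_only_entry() below registers the vdo itself as a listener on the
+ * admin thread:
+ *
+ *	vdo_register_read_only_listener(vdo, vdo, notify_vdo_of_read_only_mode,
+ *					vdo->thread_config.admin_thread);
+ *
+ * Other components can register their own listeners in the same way on the threads where they
+ * run.
+ */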
+
+/**
+ * notify_vdo_of_read_only_mode() - Notify a vdo that it is going read-only.
+ * @listener: The vdo.
+ * @parent: The completion to notify in order to acknowledge the notification.
+ *
+ * This will save the read-only state to the super block.
+ *
+ * Implements vdo_read_only_notification.
+ */
+static void notify_vdo_of_read_only_mode(void *listener, struct vdo_completion *parent)
+{
+	struct vdo *vdo = listener;
+
+	if (vdo_in_read_only_mode(vdo))
+		vdo_finish_completion(parent);
+
+	vdo_set_state(vdo, VDO_READ_ONLY_MODE);
+	vdo_save_components(vdo, parent);
+}
+
+/**
+ * vdo_enable_read_only_entry() - Enable a vdo to enter read-only mode on errors.
+ * @vdo: The vdo to enable.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int vdo_enable_read_only_entry(struct vdo *vdo)
+{
+	thread_id_t id;
+	bool is_read_only = vdo_in_read_only_mode(vdo);
+	struct read_only_notifier *notifier = &vdo->read_only_notifier;
+
+	if (is_read_only) {
+		notifier->read_only_error = VDO_READ_ONLY;
+		notifier->state = NOTIFIED;
+	} else {
+		notifier->state = MAY_NOT_NOTIFY;
+	}
+
+	spin_lock_init(&notifier->lock);
+	vdo_initialize_completion(&notifier->completion, vdo, VDO_READ_ONLY_MODE_COMPLETION);
+
+	for (id = 0; id < vdo->thread_config.thread_count; id++)
+		vdo->threads[id].is_read_only = is_read_only;
+
+	return vdo_register_read_only_listener(vdo,
+					       vdo,
+					       notify_vdo_of_read_only_mode,
+					       vdo->thread_config.admin_thread);
+}
+
+/**
+ * vdo_wait_until_not_entering_read_only_mode() - Wait until no read-only notifications are in
+ *                                                progress and prevent any subsequent
+ *                                                notifications.
+ * @parent: The completion to notify when no threads are entering read-only mode.
+ *
+ * Notifications may be re-enabled by calling vdo_allow_read_only_mode_entry().
+ */
+void vdo_wait_until_not_entering_read_only_mode(struct vdo_completion *parent)
+{
+	struct vdo *vdo = parent->vdo;
+	struct read_only_notifier *notifier = &vdo->read_only_notifier;
+
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	if (notifier->waiter != NULL) {
+		vdo_continue_completion(parent, VDO_COMPONENT_BUSY);
+		return;
+	}
+
+	spin_lock(&notifier->lock);
+	if (notifier->state == NOTIFYING)
+		notifier->waiter = parent;
+	else if (notifier->state == MAY_NOTIFY)
+		notifier->state = MAY_NOT_NOTIFY;
+	spin_unlock(&notifier->lock);
+
+	if (notifier->waiter == NULL) {
+		/*
+		 * No notification was in progress, and notifications are now
+		 * disallowed.
+		 */
+		vdo_launch_completion(parent);
+		return;
+	}
+}
+
+/**
+ * as_notifier() - Convert a generic vdo_completion to a read_only_notifier.
+ * @completion: The completion to convert.
+ *
+ * Return: The completion as a read_only_notifier.
+ */
+static inline struct read_only_notifier *as_notifier(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_READ_ONLY_MODE_COMPLETION);
+	return container_of(completion, struct read_only_notifier, completion);
+}
+
+/**
+ * finish_entering_read_only_mode() - Complete the process of entering read-only mode.
+ * @completion: The read-only mode completion.
+ */
+static void finish_entering_read_only_mode(struct vdo_completion *completion)
+{
+	struct read_only_notifier *notifier = as_notifier(completion);
+
+	vdo_assert_on_admin_thread(completion->vdo, __func__);
+
+	spin_lock(&notifier->lock);
+	notifier->state = NOTIFIED;
+	spin_unlock(&notifier->lock);
+
+	if (notifier->waiter != NULL)
+		vdo_continue_completion(UDS_FORGET(notifier->waiter), completion->result);
+}
+
+/**
+ * make_thread_read_only() - Inform each thread that the VDO is in read-only mode.
+ * @completion: The read-only mode completion.
+ */
+static void make_thread_read_only(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+	thread_id_t thread_id = completion->callback_thread_id;
+	struct read_only_notifier *notifier = as_notifier(completion);
+	struct read_only_listener *listener = completion->parent;
+
+	if (listener == NULL) {
+		/* This is the first call on this thread */
+		struct vdo_thread *thread = &vdo->threads[thread_id];
+
+		thread->is_read_only = true;
+		listener = thread->listeners;
+		if (thread_id == 0)
+			uds_log_error_strerror(READ_ONCE(notifier->read_only_error),
+					       "Unrecoverable error, entering read-only mode");
+	} else {
+		/* We've just finished notifying a listener */
+		listener = listener->next;
+	}
+
+	if (listener != NULL) {
+		/* We have a listener to notify */
+		vdo_prepare_completion(completion,
+				       make_thread_read_only,
+				       make_thread_read_only,
+				       thread_id,
+				       listener);
+		listener->notify(listener->listener, completion);
+		return;
+	}
+
+	/* We're done with this thread */
+	if (++thread_id == vdo->thread_config.dedupe_thread)
+		/*
+		 * We don't want to notify the dedupe thread since it may be
+		 * blocked rebuilding the index.
+		 */
+		++thread_id;
+
+	if (thread_id >= vdo->thread_config.thread_count)
+		/* There are no more threads */
+		vdo_prepare_completion(completion,
+				       finish_entering_read_only_mode,
+				       finish_entering_read_only_mode,
+				       vdo->thread_config.admin_thread,
+				       NULL);
+	else
+		vdo_prepare_completion(completion,
+				       make_thread_read_only,
+				       make_thread_read_only,
+				       thread_id,
+				       NULL);
+
+	vdo_launch_completion(completion);
+}
+
+/**
+ * vdo_allow_read_only_mode_entry() - Allow the notifier to put the VDO into read-only mode,
+ *                                    reversing the effects of
+ *                                    vdo_wait_until_not_entering_read_only_mode().
+ * @parent: The object to notify once the operation is complete.
+ *
+ * If some thread tried to put the vdo into read-only mode while notifications were disallowed, it
+ * will be done when this method is called. If that happens, the parent will not be notified until
+ * the vdo has actually entered read-only mode and attempted to save the super block.
+ *
+ * Context: This method may only be called from the admin thread.
+ */
+void vdo_allow_read_only_mode_entry(struct vdo_completion *parent)
+{
+	struct vdo *vdo = parent->vdo;
+	struct read_only_notifier *notifier = &vdo->read_only_notifier;
+
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	if (notifier->waiter != NULL) {
+		vdo_continue_completion(parent, VDO_COMPONENT_BUSY);
+		return;
+	}
+
+	spin_lock(&notifier->lock);
+	if (notifier->state == MAY_NOT_NOTIFY) {
+		if (notifier->read_only_error == VDO_SUCCESS) {
+			notifier->state = MAY_NOTIFY;
+		} else {
+			notifier->state = NOTIFYING;
+			notifier->waiter = parent;
+		}
+	}
+	spin_unlock(&notifier->lock);
+
+	if (notifier->waiter == NULL) {
+		/* We're done */
+		vdo_launch_completion(parent);
+		return;
+	}
+
+	/* Do the pending notification. */
+	make_thread_read_only(&notifier->completion);
+}
+
+/**
+ * vdo_enter_read_only_mode() - Put a VDO into read-only mode and save the read-only state in the
+ *                              super block.
+ * @vdo: The vdo.
+ * @error_code: The error which caused the VDO to enter read-only mode.
+ *
+ * This method is a no-op if the VDO is already read-only.
+ */
+void vdo_enter_read_only_mode(struct vdo *vdo, int error_code)
+{
+	bool notify = false;
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+	struct read_only_notifier *notifier = &vdo->read_only_notifier;
+	struct vdo_thread *thread;
+
+	if (thread_id != VDO_INVALID_THREAD_ID) {
+		thread = &vdo->threads[thread_id];
+		if (thread->is_read_only)
+			/* This thread has already gone read-only. */
+			return;
+
+		/* Record for this thread that the VDO is read-only. */
+		thread->is_read_only = true;
+	}
+
+	spin_lock(&notifier->lock);
+	if (notifier->read_only_error == VDO_SUCCESS) {
+		WRITE_ONCE(notifier->read_only_error, error_code);
+		if (notifier->state == MAY_NOTIFY) {
+			notifier->state = NOTIFYING;
+			notify = true;
+		}
+	}
+	spin_unlock(&notifier->lock);
+
+	if (!notify)
+		/* The notifier is already aware of a read-only error */
+		return;
+
+	/* Initiate a notification starting on the lowest numbered thread. */
+	vdo_launch_completion_callback(&notifier->completion, make_thread_read_only, 0);
+}
+
+/**
+ * vdo_is_read_only() - Check whether the VDO is read-only.
+ * @vdo: The vdo.
+ *
+ * This method may be called from any thread, as opposed to examining the VDO's state field, which
+ * is only safe to check from the admin thread.
+ *
+ * Return: true if the vdo is read-only.
+ */
+bool vdo_is_read_only(struct vdo *vdo)
+{
+	return vdo->threads[vdo_get_callback_thread_id()].is_read_only;
+}
+
+/**
+ * vdo_in_read_only_mode() - Check whether a vdo is in read-only mode.
+ * @vdo: The vdo to query.
+ *
+ * Return: true if the vdo is in read-only mode.
+ */
+bool vdo_in_read_only_mode(const struct vdo *vdo)
+{
+	return (vdo_get_state(vdo) == VDO_READ_ONLY_MODE);
+}
+
+/**
+ * vdo_in_recovery_mode() - Check whether the vdo is in recovery mode.
+ * @vdo: The vdo to query.
+ *
+ * Return: true if the vdo is in recovery mode.
+ */
+bool vdo_in_recovery_mode(const struct vdo *vdo)
+{
+	return (vdo_get_state(vdo) == VDO_RECOVERING);
+}
+
+/**
+ * vdo_enter_recovery_mode() - Put the vdo into recovery mode.
+ * @vdo: The vdo.
+ */
+void vdo_enter_recovery_mode(struct vdo *vdo)
+{
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	if (vdo_in_read_only_mode(vdo))
+		return;
+
+	uds_log_info("Entering recovery mode");
+	vdo_set_state(vdo, VDO_RECOVERING);
+}
+
+/**
+ * complete_synchronous_action() - Signal the waiting thread that a synchronous action is complete.
+ * @completion: The sync completion.
+ */
+static void complete_synchronous_action(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VDO_SYNC_COMPLETION);
+	complete(&(container_of(completion, struct sync_completion, vdo_completion)->completion));
+}
+
+/**
+ * perform_synchronous_action() - Launch an action on a VDO thread and wait for it to complete.
+ * @vdo: The vdo.
+ * @action: The callback to launch.
+ * @thread_id: The thread on which to run the action.
+ * @parent: The parent of the sync completion (may be NULL).
+ */
+static int perform_synchronous_action(struct vdo *vdo,
+				      vdo_action *action,
+				      thread_id_t thread_id,
+				      void *parent)
+{
+	struct sync_completion sync;
+
+	vdo_initialize_completion(&sync.vdo_completion, vdo, VDO_SYNC_COMPLETION);
+	init_completion(&sync.completion);
+	sync.vdo_completion.parent = parent;
+	vdo_launch_completion_callback(&sync.vdo_completion, action, thread_id);
+	wait_for_completion(&sync.completion);
+	return sync.vdo_completion.result;
+}
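+
+/*
+ * For example, vdo_set_compressing() below uses this to flip the compression flag on the packer
+ * thread and wait for the previous value:
+ *
+ *	perform_synchronous_action(vdo, set_compression_callback,
+ *				   vdo->thread_config.packer_thread, &enable);
+ */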
+
+/**
+ * set_compression_callback() - Callback to turn compression on or off.
+ * @completion: The completion.
+ */
+static void set_compression_callback(struct vdo_completion *completion)
+{
+	struct vdo *vdo = completion->vdo;
+	bool *enable = completion->parent;
+	bool was_enabled = vdo_get_compressing(vdo);
+
+	if (*enable != was_enabled) {
+		WRITE_ONCE(vdo->compressing, *enable);
+		if (was_enabled)
+			/* Signal the packer to flush since compression has been disabled. */
+			vdo_flush_packer(vdo->packer);
+	}
+
+	uds_log_info("compression is %s", (*enable ? "enabled" : "disabled"));
+	*enable = was_enabled;
+	complete_synchronous_action(completion);
+}
+
+/**
+ * vdo_set_compressing() - Turn compression on or off.
+ * @vdo: The vdo.
+ * @enable: Whether to enable or disable compression.
+ *
+ * Return: Whether compression was previously on or off.
+ */
+bool vdo_set_compressing(struct vdo *vdo, bool enable)
+{
+	perform_synchronous_action(vdo,
+				   set_compression_callback,
+				   vdo->thread_config.packer_thread,
+				   &enable);
+	return enable;
+}
+
+/**
+ * vdo_get_compressing() - Get whether compression is enabled in a vdo.
+ * @vdo: The vdo.
+ *
+ * Return: State of compression.
+ */
+bool vdo_get_compressing(struct vdo *vdo)
+{
+	return READ_ONCE(vdo->compressing);
+}
+
+static size_t get_block_map_cache_size(const struct vdo *vdo)
+{
+	return ((size_t) vdo->device_config->cache_size) * VDO_BLOCK_SIZE;
+}
+
+static struct error_statistics __must_check get_vdo_error_statistics(const struct vdo *vdo)
+{
+	/*
+	 * The error counts can be incremented from arbitrary threads and so must be incremented
+	 * atomically, but they are just statistics with no semantics that could rely on memory
+	 * order, so unfenced reads are sufficient.
+	 */
+	const struct atomic_statistics *atoms = &vdo->stats;
+
+	return (struct error_statistics) {
+		.invalid_advice_pbn_count = atomic64_read(&atoms->invalid_advice_pbn_count),
+		.no_space_error_count = atomic64_read(&atoms->no_space_error_count),
+		.read_only_error_count = atomic64_read(&atoms->read_only_error_count),
+	};
+}
+
+static void copy_bio_stat(struct bio_stats *b, const struct atomic_bio_stats *a)
+{
+	b->read = atomic64_read(&a->read);
+	b->write = atomic64_read(&a->write);
+	b->discard = atomic64_read(&a->discard);
+	b->flush = atomic64_read(&a->flush);
+	b->empty_flush = atomic64_read(&a->empty_flush);
+	b->fua = atomic64_read(&a->fua);
+}
+
+static struct bio_stats subtract_bio_stats(struct bio_stats minuend, struct bio_stats subtrahend)
+{
+	return (struct bio_stats) {
+		.read = minuend.read - subtrahend.read,
+		.write = minuend.write - subtrahend.write,
+		.discard = minuend.discard - subtrahend.discard,
+		.flush = minuend.flush - subtrahend.flush,
+		.empty_flush = minuend.empty_flush - subtrahend.empty_flush,
+		.fua = minuend.fua - subtrahend.fua,
+	};
+}
+
+/**
+ * vdo_get_physical_blocks_allocated() - Get the number of physical blocks in use by user data.
+ * @vdo: The vdo.
+ *
+ * Return: The number of blocks allocated for user data.
+ */
+static block_count_t __must_check vdo_get_physical_blocks_allocated(const struct vdo *vdo)
+{
+	return (vdo_get_slab_depot_allocated_blocks(vdo->depot) -
+		vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal));
+}
+
+/**
+ * vdo_get_physical_blocks_overhead() - Get the number of physical blocks used by vdo metadata.
+ * @vdo: The vdo.
+ *
+ * Return: The number of overhead blocks.
+ */
+static block_count_t __must_check vdo_get_physical_blocks_overhead(const struct vdo *vdo)
+{
+	/*
+	 * config.physical_blocks is mutated during resize and is in a packed structure,
+	 * but resize runs on the admin thread.
+	 * TODO: Verify that this is always safe.
+	 */
+	return (vdo->states.vdo.config.physical_blocks -
+		vdo_get_slab_depot_data_blocks(vdo->depot) +
+		vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal));
+}
+
+static const char *vdo_describe_state(enum vdo_state state)
+{
+	/* These strings should all fit in the 15 chars of VDOStatistics.mode. */
+	switch (state) {
+	case VDO_RECOVERING:
+		return "recovering";
+
+	case VDO_READ_ONLY_MODE:
+		return "read-only";
+
+	default:
+		return "normal";
+	}
+}
+
+/**
+ * get_vdo_statistics() - Populate a vdo_statistics structure on the admin thread.
+ * @vdo: The vdo.
+ * @stats: The statistics structure to populate.
+ */
+static void get_vdo_statistics(const struct vdo *vdo, struct vdo_statistics *stats)
+{
+	struct recovery_journal *journal = vdo->recovery_journal;
+	enum vdo_state state = vdo_get_state(vdo);
+
+	vdo_assert_on_admin_thread(vdo, __func__);
+
+	/* start with a clean slate */
+	memset(stats, 0, sizeof(struct vdo_statistics));
+
+	/*
+	 * These are immutable properties of the vdo object, so it is safe to query them from any
+	 * thread.
+	 */
+	stats->version = STATISTICS_VERSION;
+	stats->release_version = VDO_CURRENT_RELEASE_VERSION_NUMBER;
+	stats->logical_blocks = vdo->states.vdo.config.logical_blocks;
+	/*
+	 * config.physical_blocks is mutated during resize and is in a packed structure, but resize
+	 * runs on the admin thread.
+	 * TODO: verify that this is always safe
+	 */
+	stats->physical_blocks = vdo->states.vdo.config.physical_blocks;
+	stats->block_size = VDO_BLOCK_SIZE;
+	stats->complete_recoveries = vdo->states.vdo.complete_recoveries;
+	stats->read_only_recoveries = vdo->states.vdo.read_only_recoveries;
+	stats->block_map_cache_size = get_block_map_cache_size(vdo);
+
+	/* The callees are responsible for thread-safety. */
+	stats->data_blocks_used = vdo_get_physical_blocks_allocated(vdo);
+	stats->overhead_blocks_used = vdo_get_physical_blocks_overhead(vdo);
+	stats->logical_blocks_used = vdo_get_recovery_journal_logical_blocks_used(journal);
+	vdo_get_slab_depot_statistics(vdo->depot, stats);
+	stats->journal = vdo_get_recovery_journal_statistics(journal);
+	stats->packer = vdo_get_packer_statistics(vdo->packer);
+	stats->block_map = vdo_get_block_map_statistics(vdo->block_map);
+	vdo_get_dedupe_statistics(vdo->hash_zones, stats);
+	stats->errors = get_vdo_error_statistics(vdo);
+	stats->in_recovery_mode = (state == VDO_RECOVERING);
+	snprintf(stats->mode, sizeof(stats->mode), "%s", vdo_describe_state(state));
+
+	stats->instance = vdo->instance;
+	stats->current_vios_in_progress = get_data_vio_pool_active_requests(vdo->data_vio_pool);
+	stats->max_vios = get_data_vio_pool_maximum_requests(vdo->data_vio_pool);
+
+	stats->flush_out = atomic64_read(&vdo->stats.flush_out);
+	stats->logical_block_size = vdo->device_config->logical_block_size;
+	copy_bio_stat(&stats->bios_in, &vdo->stats.bios_in);
+	copy_bio_stat(&stats->bios_in_partial, &vdo->stats.bios_in_partial);
+	copy_bio_stat(&stats->bios_out, &vdo->stats.bios_out);
+	copy_bio_stat(&stats->bios_meta, &vdo->stats.bios_meta);
+	copy_bio_stat(&stats->bios_journal, &vdo->stats.bios_journal);
+	copy_bio_stat(&stats->bios_page_cache, &vdo->stats.bios_page_cache);
+	copy_bio_stat(&stats->bios_out_completed, &vdo->stats.bios_out_completed);
+	copy_bio_stat(&stats->bios_meta_completed, &vdo->stats.bios_meta_completed);
+	copy_bio_stat(&stats->bios_journal_completed, &vdo->stats.bios_journal_completed);
+	copy_bio_stat(&stats->bios_page_cache_completed, &vdo->stats.bios_page_cache_completed);
+	copy_bio_stat(&stats->bios_acknowledged, &vdo->stats.bios_acknowledged);
+	copy_bio_stat(&stats->bios_acknowledged_partial, &vdo->stats.bios_acknowledged_partial);
+	stats->bios_in_progress = subtract_bio_stats(stats->bios_in, stats->bios_acknowledged);
+	uds_get_memory_stats(&stats->memory_usage.bytes_used,
+			     &stats->memory_usage.peak_bytes_used);
+}
+
+/**
+ * vdo_fetch_statistics_callback() - Action to populate a vdo_statistics
+ *                                   structure on the admin thread.
+ * @completion: The completion.
+ *
+ * This callback is registered in vdo_fetch_statistics().
+ */
+static void vdo_fetch_statistics_callback(struct vdo_completion *completion)
+{
+	get_vdo_statistics(completion->vdo, completion->parent);
+	complete_synchronous_action(completion);
+}
+
+/**
+ * vdo_fetch_statistics() - Fetch statistics on the correct thread.
+ * @vdo: The vdo.
+ * @stats: The vdo statistics are returned here.
+ */
+void vdo_fetch_statistics(struct vdo *vdo, struct vdo_statistics *stats)
+{
+	perform_synchronous_action(vdo,
+				   vdo_fetch_statistics_callback,
+				   vdo->thread_config.admin_thread,
+				   stats);
+}
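+
+/*
+ * A minimal usage sketch (hypothetical caller, not shown in this file):
+ *
+ *	struct vdo_statistics stats;
+ *
+ *	vdo_fetch_statistics(vdo, &stats);
+ *	uds_log_info("%llu data blocks used", (unsigned long long) stats.data_blocks_used);
+ */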
+
+/**
+ * vdo_get_callback_thread_id() - Get the id of the callback thread on which a completion is
+ *                                currently running.
+ *
+ * Return: The current thread ID, or VDO_INVALID_THREAD_ID if the current thread is not a vdo
+ *         thread.
+ */
+thread_id_t vdo_get_callback_thread_id(void)
+{
+	struct vdo_work_queue *queue = vdo_get_current_work_queue();
+	struct vdo_thread *thread;
+	thread_id_t thread_id;
+
+	if (queue == NULL)
+		return VDO_INVALID_THREAD_ID;
+
+	thread = vdo_get_work_queue_owner(queue);
+	thread_id = thread->thread_id;
+
+	if (PARANOID_THREAD_CONSISTENCY_CHECKS) {
+		BUG_ON(thread_id >= thread->vdo->thread_config.thread_count);
+		BUG_ON(thread != &thread->vdo->threads[thread_id]);
+	}
+
+	return thread_id;
+}
+
+/**
+ * vdo_dump_status() - Dump status information about a vdo to the log for debugging.
+ * @vdo: The vdo to dump.
+ */
+void vdo_dump_status(const struct vdo *vdo)
+{
+	zone_count_t zone;
+
+	vdo_dump_flusher(vdo->flusher);
+	vdo_dump_recovery_journal_statistics(vdo->recovery_journal);
+	vdo_dump_packer(vdo->packer);
+	vdo_dump_slab_depot(vdo->depot);
+
+	for (zone = 0; zone < vdo->thread_config.logical_zone_count; zone++)
+		vdo_dump_logical_zone(&vdo->logical_zones->zones[zone]);
+
+	for (zone = 0; zone < vdo->thread_config.physical_zone_count; zone++)
+		vdo_dump_physical_zone(&vdo->physical_zones->zones[zone]);
+
+	vdo_dump_hash_zones(vdo->hash_zones);
+}
+
+/**
+ * vdo_assert_on_admin_thread() - Assert that we are running on the admin thread.
+ * @vdo: The vdo.
+ * @name: The name of the function which should be running on the admin thread (for logging).
+ */
+void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config.admin_thread),
+			"%s called on admin thread",
+			name);
+}
+
+/**
+ * vdo_assert_on_logical_zone_thread() - Assert that this function was called on the specified
+ *                                       logical zone thread.
+ * @vdo: The vdo.
+ * @logical_zone: The number of the logical zone.
+ * @name: The name of the calling function.
+ */
+void vdo_assert_on_logical_zone_thread(const struct vdo *vdo,
+				       zone_count_t logical_zone,
+				       const char *name)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() ==
+			 vdo->thread_config.logical_threads[logical_zone]),
+			"%s called on logical thread",
+			name);
+}
+
+/**
+ * vdo_assert_on_physical_zone_thread() - Assert that this function was called on the specified
+ *                                        physical zone thread.
+ * @vdo: The vdo.
+ * @physical_zone: The number of the physical zone.
+ * @name: The name of the calling function.
+ */
+void vdo_assert_on_physical_zone_thread(const struct vdo *vdo,
+					zone_count_t physical_zone,
+					const char *name)
+{
+	ASSERT_LOG_ONLY((vdo_get_callback_thread_id() ==
+			 vdo->thread_config.physical_threads[physical_zone]),
+			"%s called on physical thread",
+			name);
+}
+
+/**
+ * vdo_get_physical_zone() - Get the physical zone responsible for a given physical block number.
+ * @vdo: The vdo containing the physical zones.
+ * @pbn: The PBN of the data block.
+ * @zone_ptr: A pointer to return the physical zone.
+ *
+ * Gets the physical zone responsible for a given physical block number of a data block in this vdo
+ * instance, or of the zero block (for which a NULL zone is returned). For any other block number
+ * that is not in the range of valid data block numbers in any slab, an error will be returned.
+ * This function is safe to call on invalid block numbers; it will not put the vdo into read-only
+ * mode.
+ *
+ * Return: VDO_SUCCESS, VDO_OUT_OF_RANGE if the block number is invalid, or an error code for any
+ *         other failure.
+ */
+int vdo_get_physical_zone(const struct vdo *vdo,
+			  physical_block_number_t pbn,
+			  struct physical_zone **zone_ptr)
+{
+	struct vdo_slab *slab;
+	int result;
+
+	if (pbn == VDO_ZERO_BLOCK) {
+		*zone_ptr = NULL;
+		return VDO_SUCCESS;
+	}
+
+	/*
+	 * Used because it does a more restrictive bounds check than vdo_get_slab(), and done first
+	 * because it won't trigger read-only mode on an invalid PBN.
+	 */
+	if (!vdo_is_physical_data_block(vdo->depot, pbn))
+		return VDO_OUT_OF_RANGE;
+
+	/* With the PBN already checked, we should always succeed in finding a slab. */
+	slab = vdo_get_slab(vdo->depot, pbn);
+	result = ASSERT(slab != NULL, "vdo_get_slab must succeed on all valid PBNs");
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*zone_ptr = &vdo->physical_zones->zones[slab->allocator->zone_number];
+	return VDO_SUCCESS;
+}
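+
+/*
+ * A minimal usage sketch (hypothetical caller, not part of this file); a NULL zone returned with
+ * VDO_SUCCESS means the pbn was the zero block:
+ *
+ *	struct physical_zone *zone;
+ *	int result = vdo_get_physical_zone(vdo, pbn, &zone);
+ *
+ *	if (result != VDO_SUCCESS)
+ *		return result;
+ */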
diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h
new file mode 100644
index 00000000000..40e6d32fafa
--- /dev/null
+++ b/drivers/md/dm-vdo/vdo.h
@@ -0,0 +1,381 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_H
+#define VDO_H
+
+#include <linux/atomic.h>
+#include <linux/blk_types.h>
+#include <linux/completion.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include "admin-state.h"
+#include "encodings.h"
+#include "packer.h"
+#include "physical-zone.h"
+#include "statistics.h"
+#include "thread-registry.h"
+#include "types.h"
+#include "uds.h"
+#include "work-queue.h"
+
+enum notifier_state {
+	/** Notifications are allowed but not in progress */
+	MAY_NOTIFY,
+	/** A notification is in progress */
+	NOTIFYING,
+	/** Notifications are not allowed */
+	MAY_NOT_NOTIFY,
+	/** A notification has completed */
+	NOTIFIED,
+};
+
+/**
+ * typedef vdo_read_only_notification - A function to notify a listener that the VDO has gone
+ *                                      read-only.
+ * @listener: The object to notify.
+ * @parent: The completion to notify in order to acknowledge the notification.
+ */
+typedef void vdo_read_only_notification(void *listener, struct vdo_completion *parent);
+
+/*
+ * An object to be notified when the VDO enters read-only mode
+ */
+struct read_only_listener {
+	/* The listener */
+	void *listener;
+	/* The method to call to notify the listener */
+	vdo_read_only_notification *notify;
+	/* A pointer to the next listener */
+	struct read_only_listener *next;
+};
+
+struct vdo_thread {
+	struct vdo *vdo;
+	thread_id_t thread_id;
+	struct vdo_work_queue *queue;
+	/*
+	 * Each thread maintains its own notion of whether the VDO is read-only so that the
+	 * read-only state can be checked from any base thread without worrying about
+	 * synchronization or thread safety. This does mean that knowledge of the VDO going
+	 * read-only does not occur simultaneously across the VDO's threads, but that does not seem
+	 * to cause any problems.
+	 */
+	bool is_read_only;
+	/*
+	 * A list of objects waiting to be notified on this thread that the VDO has entered
+	 * read-only mode.
+	 */
+	struct read_only_listener *listeners;
+	struct registered_thread allocating_thread;
+};
+
+/* Keep struct bio statistics atomically */
+struct atomic_bio_stats {
+	atomic64_t read; /* Number of REQ_OP_READ bios */
+	atomic64_t write; /* Number of REQ_OP_WRITE bios */
+	atomic64_t discard; /* Number of REQ_OP_DISCARD bios */
+	atomic64_t flush; /* Number of REQ_PREFLUSH bios */
+	atomic64_t empty_flush; /* Number of REQ_PREFLUSH bios without data */
+	atomic64_t fua; /* Number of REQ_FUA bios */
+};
+
+/* Counters are atomic since updates can arrive concurrently from arbitrary threads. */
+struct atomic_statistics {
+	atomic64_t bios_submitted;
+	atomic64_t bios_completed;
+	atomic64_t flush_out;
+	atomic64_t invalid_advice_pbn_count;
+	atomic64_t no_space_error_count;
+	atomic64_t read_only_error_count;
+	struct atomic_bio_stats bios_in;
+	struct atomic_bio_stats bios_in_partial;
+	struct atomic_bio_stats bios_out;
+	struct atomic_bio_stats bios_out_completed;
+	struct atomic_bio_stats bios_acknowledged;
+	struct atomic_bio_stats bios_acknowledged_partial;
+	struct atomic_bio_stats bios_meta;
+	struct atomic_bio_stats bios_meta_completed;
+	struct atomic_bio_stats bios_journal;
+	struct atomic_bio_stats bios_journal_completed;
+	struct atomic_bio_stats bios_page_cache;
+	struct atomic_bio_stats bios_page_cache_completed;
+};
+
+struct read_only_notifier {
+	/* The completion for entering read-only mode */
+	struct vdo_completion completion;
+	/* A completion waiting for notifications to be drained or enabled */
+	struct vdo_completion *waiter;
+	/* Lock to protect the next two fields */
+	spinlock_t lock;
+	/* The code of the error which put the VDO into read-only mode */
+	int read_only_error;
+	/* The current state of the notifier (values described above) */
+	enum notifier_state state;
+};
+
+/*
+ * The thread ID returned when the current thread is not a vdo thread, or cannot be determined
+ * (usually due to being in interrupt context).
+ */
+#define VDO_INVALID_THREAD_ID ((thread_id_t) -1)
+
+struct thread_config {
+	zone_count_t logical_zone_count;
+	zone_count_t physical_zone_count;
+	zone_count_t hash_zone_count;
+	thread_count_t bio_thread_count;
+	thread_count_t thread_count;
+	thread_id_t admin_thread;
+	thread_id_t journal_thread;
+	thread_id_t packer_thread;
+	thread_id_t dedupe_thread;
+	thread_id_t bio_ack_thread;
+	thread_id_t cpu_thread;
+	thread_id_t *logical_threads;
+	thread_id_t *physical_threads;
+	thread_id_t *hash_zone_threads;
+	thread_id_t *bio_threads;
+};
+
+struct thread_count_config;
+
+struct vdo_super_block {
+	/* The vio for reading and writing the super block to disk */
+	struct vio vio;
+	/* A buffer to hold the super block */
+	u8 *buffer;
+	/* Whether this super block may not be written */
+	bool unwriteable;
+};
+
+struct data_vio_pool;
+
+struct vdo_administrator {
+	struct vdo_completion completion;
+	struct admin_state state;
+	atomic_t busy;
+	u32 phase;
+	struct completion callback_sync;
+};
+
+struct vdo {
+	char thread_name_prefix[MAX_VDO_WORK_QUEUE_NAME_LEN];
+	struct vdo_thread *threads;
+	vdo_action *action;
+	struct vdo_completion *completion;
+	struct vio_tracer *vio_tracer;
+
+	/* The atomic version of the state of this vdo */
+	atomic_t state;
+	/* The full state of all components */
+	struct vdo_component_states states;
+	/*
+	 * A counter value to attach to thread names and log messages to identify the individual
+	 * device.
+	 */
+	unsigned int instance;
+	/* The read-only notifier */
+	struct read_only_notifier read_only_notifier;
+	/* The load-time configuration of this vdo */
+	struct device_config *device_config;
+	/* The thread mapping */
+	struct thread_config thread_config;
+
+	/* The super block */
+	struct vdo_super_block super_block;
+
+	/* The partitioning of the underlying storage */
+	struct layout layout;
+	struct layout next_layout;
+	struct dm_kcopyd_client *partition_copier;
+
+	/* The block map */
+	struct block_map *block_map;
+
+	/* The journal for block map recovery */
+	struct recovery_journal *recovery_journal;
+
+	/* The slab depot */
+	struct slab_depot *depot;
+
+	/* The compressed-block packer */
+	struct packer *packer;
+	/* Whether incoming data should be compressed */
+	bool compressing;
+
+	/* The handler for flush requests */
+	struct flusher *flusher;
+
+	/* The state the vdo was in when loaded (primarily for unit tests) */
+	enum vdo_state load_state;
+
+	/* The logical zones of this vdo */
+	struct logical_zones *logical_zones;
+
+	/* The physical zones of this vdo */
+	struct physical_zones *physical_zones;
+
+	/* The hash lock zones of this vdo */
+	struct hash_zones *hash_zones;
+
+	/* Bio submission manager used for sending bios to the storage device. */
+	struct io_submitter *io_submitter;
+
+	/* The pool of data_vios for servicing incoming bios */
+	struct data_vio_pool *data_vio_pool;
+
+	/* The manager for administrative operations */
+	struct vdo_administrator admin;
+
+	/* Flags controlling administrative operations */
+	const struct admin_state_code *suspend_type;
+	bool allocations_allowed;
+	bool dump_on_shutdown;
+	atomic_t processing_message;
+
+	/*
+	 * Statistics
+	 * Atomic stats counters
+	 */
+	struct atomic_statistics stats;
+	/* Used to gather statistics without allocating memory */
+	struct vdo_statistics stats_buffer;
+	/* Protects the stats_buffer */
+	struct mutex stats_mutex;
+	/* true if sysfs directory is set up */
+	bool sysfs_added;
+	/* Used when shutting down the sysfs statistics */
+	struct completion stats_shutdown;
+
+	/* A list of all device_configs referencing this vdo */
+	struct list_head device_config_list;
+
+	/* This VDO's list entry for the device registry */
+	struct list_head registration;
+
+	/* Underlying block device info. */
+	u64 starting_sector_offset;
+	struct volume_geometry geometry;
+
+	/* For sysfs */
+	struct kobject vdo_directory;
+	struct kobject stats_directory;
+
+	/* N blobs of context data for LZ4 code, one per CPU thread. */
+	char **compression_context;
+};
+
+/**
+ * vdo_uses_bio_ack_queue() - Indicate whether the vdo is configured to use a separate work queue
+ *                            for acknowledging received and processed bios.
+ * @vdo: The vdo.
+ *
+ * Note that this directly controls the handling of write operations, but the compile-time flag
+ * VDO_USE_BIO_ACK_QUEUE_FOR_READ is also checked for read operations.
+ *
+ * Return: Whether a bio-acknowledgement work queue is in use.
+ */
+static inline bool vdo_uses_bio_ack_queue(struct vdo *vdo)
+{
+	return vdo->device_config->thread_counts.bio_ack_threads > 0;
+}
+
+/**
+ * typedef vdo_filter_t - Method type for vdo matching methods.
+ *
+ * A filter function returns false if the vdo doesn't match.
+ */
+typedef bool vdo_filter_t(struct vdo *vdo, const void *context);
+
+void vdo_initialize_device_registry_once(void);
+struct vdo * __must_check vdo_find_matching(vdo_filter_t *filter, const void *context);
+
+int __must_check vdo_make_thread(struct vdo *vdo,
+				 thread_id_t thread_id,
+				 const struct vdo_work_queue_type *type,
+				 unsigned int queue_count,
+				 void *contexts[]);
+
+static inline int __must_check vdo_make_default_thread(struct vdo *vdo, thread_id_t thread_id)
+{
+	return vdo_make_thread(vdo, thread_id, NULL, 1, NULL);
+}
+
+int __must_check
+vdo_make(unsigned int instance, struct device_config *config, char **reason, struct vdo **vdo_ptr);
+
+void vdo_destroy(struct vdo *vdo);
+
+void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent);
+
+int __must_check vdo_add_sysfs_stats_dir(struct vdo *vdo);
+
+struct block_device * __must_check vdo_get_backing_device(const struct vdo *vdo);
+
+const char * __must_check vdo_get_device_name(const struct dm_target *target);
+
+int __must_check vdo_synchronous_flush(struct vdo *vdo);
+
+const struct admin_state_code * __must_check vdo_get_admin_state(const struct vdo *vdo);
+
+bool vdo_set_compressing(struct vdo *vdo, bool enable);
+
+bool vdo_get_compressing(struct vdo *vdo);
+
+void vdo_fetch_statistics(struct vdo *vdo, struct vdo_statistics *stats);
+
+thread_id_t vdo_get_callback_thread_id(void);
+
+enum vdo_state __must_check vdo_get_state(const struct vdo *vdo);
+
+void vdo_set_state(struct vdo *vdo, enum vdo_state state);
+
+void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent);
+
+int vdo_register_read_only_listener(struct vdo *vdo,
+				    void *listener,
+				    vdo_read_only_notification *notification,
+				    thread_id_t thread_id);
+
+int vdo_enable_read_only_entry(struct vdo *vdo);
+
+void vdo_wait_until_not_entering_read_only_mode(struct vdo_completion *parent);
+
+void vdo_allow_read_only_mode_entry(struct vdo_completion *parent);
+
+void vdo_enter_read_only_mode(struct vdo *vdo, int error_code);
+
+bool __must_check vdo_is_read_only(struct vdo *vdo);
+
+bool __must_check vdo_in_read_only_mode(const struct vdo *vdo);
+
+bool __must_check vdo_in_recovery_mode(const struct vdo *vdo);
+
+void vdo_enter_recovery_mode(struct vdo *vdo);
+
+void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name);
+
+void vdo_assert_on_logical_zone_thread(const struct vdo *vdo,
+				       zone_count_t logical_zone,
+				       const char *name);
+
+void vdo_assert_on_physical_zone_thread(const struct vdo *vdo,
+					zone_count_t physical_zone,
+					const char *name);
+
+int __must_check vdo_get_physical_zone(const struct vdo *vdo,
+				       physical_block_number_t pbn,
+				       struct physical_zone **zone_ptr);
+
+void vdo_dump_status(const struct vdo *vdo);
+
+#endif /* VDO_H */
diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c
new file mode 100644
index 00000000000..e42b13bf1cd
--- /dev/null
+++ b/drivers/md/dm-vdo/vio.c
@@ -0,0 +1,525 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "vio.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/kernel.h>
+#include <linux/ratelimit.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "constants.h"
+#include "io-submitter.h"
+#include "vdo.h"
+
+/* A vio_pool is a collection of preallocated vios. */
+struct vio_pool {
+	/** The number of objects managed by the pool */
+	size_t size;
+	/** The list of objects which are available */
+	struct list_head available;
+	/** The queue of requestors waiting for objects from the pool */
+	struct wait_queue waiting;
+	/** The number of objects currently in use */
+	size_t busy_count;
+	/** The list of objects which are in use */
+	struct list_head busy;
+	/** The ID of the thread on which this pool may be used */
+	thread_id_t thread_id;
+	/** The buffer backing the pool's vios */
+	char *buffer;
+	/** The pool entries */
+	struct pooled_vio vios[];
+};
+
+physical_block_number_t pbn_from_vio_bio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct vdo *vdo = vio->completion.vdo;
+	physical_block_number_t pbn = bio->bi_iter.bi_sector / VDO_SECTORS_PER_BLOCK;
+
+	return ((pbn == VDO_GEOMETRY_BLOCK_LOCATION) ? pbn : pbn + vdo->geometry.bio_offset);
+}
+
+static int create_multi_block_bio(block_count_t size, struct bio **bio_ptr)
+{
+	struct bio *bio = NULL;
+	int result;
+
+	result = UDS_ALLOCATE_EXTENDED(struct bio, size + 1, struct bio_vec, "bio", &bio);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*bio_ptr = bio;
+	return VDO_SUCCESS;
+}
+
+int vdo_create_bio(struct bio **bio_ptr)
+{
+	return create_multi_block_bio(1, bio_ptr);
+}
+
+void vdo_free_bio(struct bio *bio)
+{
+	if (bio == NULL)
+		return;
+
+	bio_uninit(bio);
+	UDS_FREE(UDS_FORGET(bio));
+}
+
+int allocate_vio_components(struct vdo *vdo,
+			    enum vio_type vio_type,
+			    enum vio_priority priority,
+			    void *parent,
+			    unsigned int block_count,
+			    char *data,
+			    struct vio *vio)
+{
+	struct bio *bio;
+	int result;
+
+	result = ASSERT(block_count <= MAX_BLOCKS_PER_VIO,
+			"block count %u does not exceed maximum %u",
+			block_count,
+			MAX_BLOCKS_PER_VIO);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = ASSERT(((vio_type != VIO_TYPE_UNINITIALIZED) && (vio_type != VIO_TYPE_DATA)),
+			"%d is a metadata type",
+			vio_type);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = create_multi_block_bio(block_count, &bio);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	initialize_vio(vio, bio, block_count, vio_type, priority, vdo);
+	vio->completion.parent = parent;
+	vio->data = data;
+	return VDO_SUCCESS;
+}
+
+/**
+ * create_multi_block_metadata_vio() - Create a metadata vio spanning one or more blocks.
+ * @vdo: The vdo on which the vio will operate.
+ * @vio_type: The type of vio to create.
+ * @priority: The relative priority to assign to the vio.
+ * @parent: The parent of the vio.
+ * @block_count: The size of the vio in blocks.
+ * @data: The buffer.
+ * @vio_ptr: A pointer to hold the new vio.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+int create_multi_block_metadata_vio(struct vdo *vdo,
+				    enum vio_type vio_type,
+				    enum vio_priority priority,
+				    void *parent,
+				    unsigned int block_count,
+				    char *data,
+				    struct vio **vio_ptr)
+{
+	struct vio *vio;
+	int result;
+
+	/* If struct vio grows past 256 bytes, we'll lose benefits of VDOSTORY-176. */
+	STATIC_ASSERT(sizeof(struct vio) <= 256);
+
+	/*
+	 * Metadata vios should use direct allocation and not use the buffer pool, which is
+	 * reserved for submissions from the linux block layer.
+	 */
+	result = UDS_ALLOCATE(1, struct vio, __func__, &vio);
+	if (result != VDO_SUCCESS) {
+		uds_log_error("metadata vio allocation failure %d", result);
+		return result;
+	}
+
+	result = allocate_vio_components(vdo, vio_type, priority, parent, block_count, data, vio);
+	if (result != VDO_SUCCESS) {
+		UDS_FREE(vio);
+		return result;
+	}
+
+	*vio_ptr = vio;
+	return VDO_SUCCESS;
+}
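+
+/*
+ * Minimal usage sketch (illustrative only; the vio type, priority, parent, and buffer shown here
+ * are assumptions rather than a real caller):
+ *
+ *	struct vio *vio;
+ *	int result = create_multi_block_metadata_vio(vdo, VIO_TYPE_BLOCK_MAP, VIO_PRIORITY_HIGH,
+ *						      parent, 1, buffer, &vio);
+ *
+ *	if (result != VDO_SUCCESS)
+ *		return result;
+ *	(submit I/O, then once the vio is no longer needed:)
+ *	free_vio(UDS_FORGET(vio));
+ */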
+
+/**
+ * free_vio_components() - Free the components of a vio embedded in a larger structure.
+ * @vio: The vio to destroy
+ */
+void free_vio_components(struct vio *vio)
+{
+	if (vio == NULL)
+		return;
+
+	BUG_ON(is_data_vio(vio));
+	vdo_free_bio(UDS_FORGET(vio->bio));
+}
+
+/**
+ * free_vio() - Destroy a vio.
+ * @vio: The vio to destroy.
+ */
+void free_vio(struct vio *vio)
+{
+	free_vio_components(vio);
+	UDS_FREE(vio);
+}
+
+/* Set bio properties for a VDO read or write. */
+void vdo_set_bio_properties(struct bio *bio,
+			    struct vio *vio,
+			    bio_end_io_t callback,
+			    unsigned int bi_opf,
+			    physical_block_number_t pbn)
+{
+	struct vdo *vdo = vio->completion.vdo;
+	struct device_config *config = vdo->device_config;
+
+	pbn -= vdo->geometry.bio_offset;
+	vio->bio_zone = ((pbn / config->thread_counts.bio_rotation_interval) %
+			 config->thread_counts.bio_threads);
+
+	bio->bi_private = vio;
+	bio->bi_end_io = callback;
+	bio->bi_opf = bi_opf;
+	bio->bi_iter.bi_sector = pbn * VDO_SECTORS_PER_BLOCK;
+}
+
+/*
+ * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated
+ * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a
+ * vio associated with the bio.
+ */
+int vio_reset_bio(struct vio *vio,
+		  char *data,
+		  bio_end_io_t callback,
+		  unsigned int bi_opf,
+		  physical_block_number_t pbn)
+{
+	int bvec_count, offset, len, i;
+	struct bio *bio = vio->bio;
+
+	bio_reset(bio, bio->bi_bdev, bi_opf);
+	vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn);
+	if (data == NULL)
+		return VDO_SUCCESS;
+
+	bio->bi_io_vec = bio->bi_inline_vecs;
+	bio->bi_max_vecs = vio->block_count + 1;
+	len = VDO_BLOCK_SIZE * vio->block_count;
+	offset = offset_in_page(data);
+	bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+
+	/*
+	 * If the data were guaranteed to be on one page, or on physically contiguous pages, this
+	 * loop would be unnecessary. But a vmalloc'ed buffer may span pages which bio_add_page()
+	 * cannot merge, so add each page separately.
+	 */
+	for (i = 0; (i < bvec_count) && (len > 0); i++) {
+		struct page *page;
+		int bytes_added;
+		int bytes = PAGE_SIZE - offset;
+
+		if (bytes > len)
+			bytes = len;
+
+		page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data);
+		bytes_added = bio_add_page(bio, page, bytes, offset);
+
+		if (bytes_added != bytes)
+			return uds_log_error_strerror(VDO_BIO_CREATION_FAILED,
+						      "Could only add %i bytes to bio",
+						       bytes_added);
+
+		data += bytes;
+		len -= bytes;
+		offset = 0;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * update_vio_error_stats() - Update per-vio error stats and log the error.
+ * @vio: The vio which got an error.
+ * @format: The format of the message to log (a printf style format).
+ */
+void update_vio_error_stats(struct vio *vio, const char *format, ...)
+{
+	static DEFINE_RATELIMIT_STATE(error_limiter,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	va_list args;
+	int priority;
+	struct vdo *vdo = vio->completion.vdo;
+
+	switch (vio->completion.result) {
+	case VDO_READ_ONLY:
+		atomic64_inc(&vdo->stats.read_only_error_count);
+		return;
+
+	case VDO_NO_SPACE:
+		atomic64_inc(&vdo->stats.no_space_error_count);
+		priority = UDS_LOG_DEBUG;
+		break;
+
+	default:
+		priority = UDS_LOG_ERR;
+	}
+
+	if (!__ratelimit(&error_limiter))
+		return;
+
+	va_start(args, format);
+	uds_vlog_strerror(priority, vio->completion.result, UDS_LOGGING_MODULE_NAME, format, args);
+	va_end(args);
+}
+
+void vio_record_metadata_io_error(struct vio *vio)
+{
+	const char *description;
+	physical_block_number_t pbn = pbn_from_vio_bio(vio->bio);
+
+	if (bio_op(vio->bio) == REQ_OP_READ)
+		description = "read";
+	else if ((vio->bio->bi_opf & REQ_PREFLUSH) == REQ_PREFLUSH)
+		description = (((vio->bio->bi_opf & REQ_FUA) == REQ_FUA) ?
+			       "write+preflush+fua" :
+			       "write+preflush");
+	else if ((vio->bio->bi_opf & REQ_FUA) == REQ_FUA)
+		description = "write+fua";
+	else
+		description = "write";
+
+	update_vio_error_stats(vio,
+			       "Completing %s vio of type %u for physical block %llu with error",
+			       description,
+			       vio->type,
+			       (unsigned long long) pbn);
+}
+
+/**
+ * make_vio_pool() - Create a new vio pool.
+ * @vdo: The vdo.
+ * @pool_size: The number of vios in the pool.
+ * @thread_id: The ID of the thread using this pool.
+ * @vio_type: The type of vios in the pool.
+ * @priority: The priority with which vios from the pool should be enqueued.
+ * @context: The context that each entry will have.
+ * @pool_ptr: The resulting pool.
+ *
+ * Return: A success or error code.
+ */
+int make_vio_pool(struct vdo *vdo,
+		  size_t pool_size,
+		  thread_id_t thread_id,
+		  enum vio_type vio_type,
+		  enum vio_priority priority,
+		  void *context,
+		  struct vio_pool **pool_ptr)
+{
+	struct vio_pool *pool;
+	char *ptr;
+	int result;
+
+	result = UDS_ALLOCATE_EXTENDED(struct vio_pool,
+				       pool_size,
+				       struct pooled_vio,
+				       __func__,
+				       &pool);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	pool->thread_id = thread_id;
+	INIT_LIST_HEAD(&pool->available);
+	INIT_LIST_HEAD(&pool->busy);
+
+	result = UDS_ALLOCATE(pool_size * VDO_BLOCK_SIZE, char, "VIO pool buffer", &pool->buffer);
+	if (result != VDO_SUCCESS) {
+		free_vio_pool(pool);
+		return result;
+	}
+
+	ptr = pool->buffer;
+	for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) {
+		struct pooled_vio *pooled = &pool->vios[pool->size];
+
+		result = allocate_vio_components(vdo,
+						 vio_type,
+						 priority,
+						 NULL,
+						 1,
+						 ptr,
+						 &pooled->vio);
+		if (result != VDO_SUCCESS) {
+			free_vio_pool(pool);
+			return result;
+		}
+
+		pooled->context = context;
+		list_add_tail(&pooled->pool_entry, &pool->available);
+	}
+
+	*pool_ptr = pool;
+	return VDO_SUCCESS;
+}
+
+/**
+ * free_vio_pool() - Destroy a vio pool.
+ * @pool: The pool to free.
+ */
+void free_vio_pool(struct vio_pool *pool)
+{
+	struct pooled_vio *pooled, *tmp;
+
+	if (pool == NULL)
+		return;
+
+	/* Remove all available vios from the object pool. */
+	ASSERT_LOG_ONLY(!vdo_has_waiters(&pool->waiting),
+			"VIO pool must not have any waiters when being freed");
+	ASSERT_LOG_ONLY((pool->busy_count == 0),
+			"VIO pool must not have %zu busy entries when being freed",
+			pool->busy_count);
+	ASSERT_LOG_ONLY(list_empty(&pool->busy),
+			"VIO pool must not have busy entries when being freed");
+
+	list_for_each_entry_safe(pooled, tmp, &pool->available, pool_entry) {
+		list_del(&pooled->pool_entry);
+		free_vio_components(&pooled->vio);
+		pool->size--;
+	}
+
+	ASSERT_LOG_ONLY(pool->size == 0,
+			"VIO pool must not have missing entries when being freed");
+
+	UDS_FREE(UDS_FORGET(pool->buffer));
+	UDS_FREE(pool);
+}
+
+/**
+ * is_vio_pool_busy() - Check whether a vio pool has outstanding entries.
+ * @pool: The pool to check.
+ *
+ * Return: true if the pool is busy.
+ */
+bool is_vio_pool_busy(struct vio_pool *pool)
+{
+	return (pool->busy_count != 0);
+}
+
+/**
+ * acquire_vio_from_pool() - Acquire a vio and buffer from the pool (asynchronous).
+ * @pool: The vio pool.
+ * @waiter: Object that is requesting a vio.
+ */
+void acquire_vio_from_pool(struct vio_pool *pool, struct waiter *waiter)
+{
+	struct pooled_vio *pooled;
+
+	ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()),
+			"acquire from active vio_pool called from correct thread");
+
+	if (list_empty(&pool->available)) {
+		vdo_enqueue_waiter(&pool->waiting, waiter);
+		return;
+	}
+
+	pooled = list_first_entry(&pool->available, struct pooled_vio, pool_entry);
+	pool->busy_count++;
+	list_move_tail(&pooled->pool_entry, &pool->busy);
+	(*waiter->callback)(waiter, pooled);
+}
+
+/**
+ * return_vio_to_pool() - Return a vio to the pool.
+ * @pool: The vio pool.
+ * @vio: The pooled vio to return.
+ */
+void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio)
+{
+	ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()),
+			"vio pool entry returned on same thread as it was acquired");
+
+	vio->vio.completion.error_handler = NULL;
+	vio->vio.completion.parent = NULL;
+	if (vdo_has_waiters(&pool->waiting)) {
+		vdo_notify_next_waiter(&pool->waiting, NULL, vio);
+		return;
+	}
+
+	list_move_tail(&vio->pool_entry, &pool->available);
+	--pool->busy_count;
+}
+
+/*
+ * Various counting functions for statistics.
+ * These are used for bios coming into VDO, as well as bios generated by VDO.
+ */
+void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio)
+{
+	if (((bio->bi_opf & REQ_PREFLUSH) != 0) && (bio->bi_iter.bi_size == 0)) {
+		atomic64_inc(&bio_stats->empty_flush);
+		atomic64_inc(&bio_stats->flush);
+		return;
+	}
+
+	switch (bio_op(bio)) {
+	case REQ_OP_WRITE:
+		atomic64_inc(&bio_stats->write);
+		break;
+	case REQ_OP_READ:
+		atomic64_inc(&bio_stats->read);
+		break;
+	case REQ_OP_DISCARD:
+		atomic64_inc(&bio_stats->discard);
+		break;
+		/*
+		 * All other operations are filtered out in dmvdo.c, or not created by VDO, so
+		 * shouldn't exist.
+		 */
+	default:
+		ASSERT_LOG_ONLY(0, "Bio operation %d not a write, read, discard, or empty flush",
+				bio_op(bio));
+	}
+
+	if ((bio->bi_opf & REQ_PREFLUSH) != 0)
+		atomic64_inc(&bio_stats->flush);
+	if (bio->bi_opf & REQ_FUA)
+		atomic64_inc(&bio_stats->fua);
+}
+
+static void count_all_bios_completed(struct vio *vio, struct bio *bio)
+{
+	struct atomic_statistics *stats = &vio->completion.vdo->stats;
+
+	if (is_data_vio(vio)) {
+		vdo_count_bios(&stats->bios_out_completed, bio);
+		return;
+	}
+
+	vdo_count_bios(&stats->bios_meta_completed, bio);
+	if (vio->type == VIO_TYPE_RECOVERY_JOURNAL)
+		vdo_count_bios(&stats->bios_journal_completed, bio);
+	else if (vio->type == VIO_TYPE_BLOCK_MAP)
+		vdo_count_bios(&stats->bios_page_cache_completed, bio);
+}
+
+void vdo_count_completed_bios(struct bio *bio)
+{
+	struct vio *vio = (struct vio *) bio->bi_private;
+
+	atomic64_inc(&vio->completion.vdo->stats.bios_completed);
+	count_all_bios_completed(vio, bio);
+}
diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h
new file mode 100644
index 00000000000..f39f568834e
--- /dev/null
+++ b/drivers/md/dm-vdo/vio.h
@@ -0,0 +1,221 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VIO_H
+#define VIO_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+#include "completion.h"
+#include "constants.h"
+#include "types.h"
+#include "vdo.h"
+
+enum {
+	MAX_BLOCKS_PER_VIO = (BIO_MAX_VECS << PAGE_SHIFT) / VDO_BLOCK_SIZE,
+};
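+
+/*
+ * For example, on a typical x86-64 configuration with 4 KB pages (PAGE_SHIFT == 12),
+ * BIO_MAX_VECS == 256, and a 4096-byte VDO_BLOCK_SIZE, this works out to
+ * (256 << 12) / 4096 = 256 blocks per vio. The exact limit is configuration dependent.
+ */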
+
+struct pooled_vio {
+	/* The underlying vio */
+	struct vio vio;
+	/* The list entry for chaining pooled vios together */
+	struct list_head list_entry;
+	/* The context set by the pool */
+	void *context;
+	/* The list entry used by the pool */
+	struct list_head pool_entry;
+};
+
+/**
+ * as_vio() - Convert a generic vdo_completion to a vio.
+ * @completion: The completion to convert.
+ *
+ * Return: The completion as a vio.
+ */
+static inline struct vio *as_vio(struct vdo_completion *completion)
+{
+	vdo_assert_completion_type(completion, VIO_COMPLETION);
+	return container_of(completion, struct vio, completion);
+}
+
+/**
+ * get_vio_bio_zone_thread_id() - Get the thread id of the bio zone in which a vio should submit
+ *                                its I/O.
+ * @vio: The vio.
+ *
+ * Return: The id of the bio zone thread the vio should use.
+ */
+static inline thread_id_t __must_check get_vio_bio_zone_thread_id(struct vio *vio)
+{
+	return vio->completion.vdo->thread_config.bio_threads[vio->bio_zone];
+}
+
+physical_block_number_t __must_check pbn_from_vio_bio(struct bio *bio);
+
+/**
+ * assert_vio_in_bio_zone() - Check that a vio is running on the correct thread for its bio zone.
+ * @vio: The vio to check.
+ */
+static inline void assert_vio_in_bio_zone(struct vio *vio)
+{
+	thread_id_t expected = get_vio_bio_zone_thread_id(vio);
+	thread_id_t thread_id = vdo_get_callback_thread_id();
+
+	ASSERT_LOG_ONLY((expected == thread_id),
+			"vio I/O for physical block %llu on thread %u, should be on bio zone thread %u",
+			(unsigned long long) pbn_from_vio_bio(vio->bio),
+			thread_id,
+			expected);
+}
+
+int vdo_create_bio(struct bio **bio_ptr);
+void vdo_free_bio(struct bio *bio);
+int allocate_vio_components(struct vdo *vdo,
+			    enum vio_type vio_type,
+			    enum vio_priority priority,
+			    void *parent,
+			    unsigned int block_count,
+			    char *data,
+			    struct vio *vio);
+int __must_check create_multi_block_metadata_vio(struct vdo *vdo,
+						 enum vio_type vio_type,
+						 enum vio_priority priority,
+						 void *parent,
+						 unsigned int block_count,
+						 char *data,
+						 struct vio **vio_ptr);
+
+static inline int __must_check
+create_metadata_vio(struct vdo *vdo,
+		    enum vio_type vio_type,
+		    enum vio_priority priority,
+		    void *parent,
+		    char *data,
+		    struct vio **vio_ptr)
+{
+	return create_multi_block_metadata_vio(vdo, vio_type, priority, parent, 1, data, vio_ptr);
+}
+
+void free_vio_components(struct vio *vio);
+void free_vio(struct vio *vio);
+
+/**
+ * initialize_vio() - Initialize a vio.
+ * @vio: The vio to initialize.
+ * @bio: The bio this vio should use for its I/O.
+ * @block_count: The size of this vio in vdo blocks.
+ * @vio_type: The vio type.
+ * @priority: The relative priority of the vio.
+ * @vdo: The vdo for this vio.
+ */
+static inline void initialize_vio(struct vio *vio,
+				  struct bio *bio,
+				  unsigned int block_count,
+				  enum vio_type vio_type,
+				  enum vio_priority priority,
+				  struct vdo *vdo)
+{
+	/* data_vios may not span multiple blocks. */
+	BUG_ON((vio_type == VIO_TYPE_DATA) && (block_count != 1));
+
+	vio->bio = bio;
+	vio->block_count = block_count;
+	vio->type = vio_type;
+	vio->priority = priority;
+	vdo_initialize_completion(&vio->completion, vdo, VIO_COMPLETION);
+}
+
+void vdo_set_bio_properties(struct bio *bio,
+			    struct vio *vio,
+			    bio_end_io_t callback,
+			    unsigned int bi_opf,
+			    physical_block_number_t pbn);
+
+int vio_reset_bio(struct vio *vio,
+		  char *data,
+		  bio_end_io_t callback,
+		  unsigned int bi_opf,
+		  physical_block_number_t pbn);
+
+void update_vio_error_stats(struct vio *vio, const char *format, ...)
+	__printf(2, 3);
+
+/**
+ * is_data_vio() - Check whether a vio is servicing an external data request.
+ * @vio: The vio to check.
+ */
+static inline bool is_data_vio(struct vio *vio)
+{
+	return (vio->type == VIO_TYPE_DATA);
+}
+
+/**
+ * get_metadata_priority() - Convert a vio's priority to a work item priority.
+ * @vio: The vio.
+ *
+ * Return: The priority with which to submit the vio's bio.
+ */
+static inline enum vdo_completion_priority get_metadata_priority(struct vio *vio)
+{
+	return ((vio->priority == VIO_PRIORITY_HIGH) ?
+		BIO_Q_HIGH_PRIORITY :
+		BIO_Q_METADATA_PRIORITY);
+}
+
+/**
+ * continue_vio() - Enqueue a vio to run its next callback.
+ * @vio: The vio to continue.
+ *
+ * Return: The result of the current operation.
+ */
+static inline void continue_vio(struct vio *vio, int result)
+{
+	if (unlikely(result != VDO_SUCCESS))
+		vdo_set_completion_result(&vio->completion, result);
+
+	vdo_enqueue_completion(&vio->completion, VDO_WORK_Q_DEFAULT_PRIORITY);
+}
+
+void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio);
+void vdo_count_completed_bios(struct bio *bio);
+
+/**
+ * continue_vio_after_io() - Continue a vio now that its I/O has returned.
+ */
+static inline void continue_vio_after_io(struct vio *vio, vdo_action *callback, thread_id_t thread)
+{
+	vdo_count_completed_bios(vio->bio);
+	vdo_set_completion_callback(&vio->completion, callback, thread);
+	continue_vio(vio, blk_status_to_errno(vio->bio->bi_status));
+}
+
+void vio_record_metadata_io_error(struct vio *vio);
+
+/* A vio_pool is a collection of preallocated vios used to write arbitrary metadata blocks. */
+
+static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio)
+{
+	return container_of(vio, struct pooled_vio, vio);
+}
+
+struct vio_pool;
+
+int __must_check make_vio_pool(struct vdo *vdo,
+			       size_t pool_size,
+			       thread_id_t thread_id,
+			       enum vio_type vio_type,
+			       enum vio_priority priority,
+			       void *context,
+			       struct vio_pool **pool_ptr);
+void free_vio_pool(struct vio_pool *pool);
+bool __must_check is_vio_pool_busy(struct vio_pool *pool);
+void acquire_vio_from_pool(struct vio_pool *pool, struct waiter *waiter);
+void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio);
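+
+/*
+ * Typical pool life cycle, as an illustrative sketch (the pool size, vio type, priority, and
+ * waiter handling here are assumptions):
+ *
+ *	struct vio_pool *pool;
+ *	int result = make_vio_pool(vdo, 16, thread_id, VIO_TYPE_BLOCK_MAP, VIO_PRIORITY_HIGH,
+ *				   context, &pool);
+ *
+ *	if (result != VDO_SUCCESS)
+ *		return result;
+ *	acquire_vio_from_pool(pool, &waiter);  (waiter->callback is handed a struct pooled_vio)
+ *	(use the vio, then:)
+ *	return_vio_to_pool(pool, pooled);
+ *	free_vio_pool(pool);
+ */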
+
+#endif /* VIO_H */
diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c
new file mode 100644
index 00000000000..4048c11c3e5
--- /dev/null
+++ b/drivers/md/dm-vdo/wait-queue.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "wait-queue.h"
+
+#include <linux/device-mapper.h>
+
+#include "permassert.h"
+
+#include "status-codes.h"
+
+/**
+ * vdo_enqueue_waiter() - Add a waiter to the tail end of a wait queue.
+ * @queue: The queue to which to add the waiter.
+ * @waiter: The waiter to add to the queue.
+ *
+ * The waiter must not already be waiting in a queue.
+ */
+void vdo_enqueue_waiter(struct wait_queue *queue, struct waiter *waiter)
+{
+	BUG_ON(waiter->next_waiter != NULL);
+
+	if (queue->last_waiter == NULL) {
+		/*
+		 * The queue is empty, so form the initial circular list by self-linking the
+		 * initial waiter.
+		 */
+		waiter->next_waiter = waiter;
+	} else {
+		/* Splice the new waiter in at the end of the queue. */
+		waiter->next_waiter = queue->last_waiter->next_waiter;
+		queue->last_waiter->next_waiter = waiter;
+	}
+
+	/* In both cases, the waiter we added to the ring becomes the last waiter. */
+	queue->last_waiter = waiter;
+	queue->queue_length += 1;
+}
+
+/**
+ * vdo_transfer_all_waiters() - Transfer all waiters from one wait queue to a second queue,
+ *                              emptying the first queue.
+ * @from_queue: The queue containing the waiters to move.
+ * @to_queue: The queue that will receive the waiters from the first queue.
+ */
+void vdo_transfer_all_waiters(struct wait_queue *from_queue, struct wait_queue *to_queue)
+{
+	/* If the source queue is empty, there's nothing to do. */
+	if (!vdo_has_waiters(from_queue))
+		return;
+
+	if (vdo_has_waiters(to_queue)) {
+		/*
+		 * Both queues are non-empty. Splice the two circular lists together by swapping
+		 * the next (head) pointers in the list tails.
+		 */
+		struct waiter *from_head = from_queue->last_waiter->next_waiter;
+		struct waiter *to_head = to_queue->last_waiter->next_waiter;
+
+		to_queue->last_waiter->next_waiter = from_head;
+		from_queue->last_waiter->next_waiter = to_head;
+	}
+
+	to_queue->last_waiter = from_queue->last_waiter;
+	to_queue->queue_length += from_queue->queue_length;
+	vdo_initialize_wait_queue(from_queue);
+}
+
+/**
+ * vdo_notify_all_waiters() - Notify all the entries waiting in a queue.
+ * @queue: The wait queue containing the waiters to notify.
+ * @callback: The function to call to notify each waiter, or NULL to invoke the callback field
+ *            registered in each waiter.
+ * @context: The context to pass to the callback function.
+ *
+ * Notifies all the entries waiting in a queue to continue execution by invoking a callback
+ * function on each of them in turn. The queue is copied and emptied before invoking any callbacks,
+ * and only the waiters that were in the queue at the start of the call will be notified.
+ */
+void vdo_notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, void *context)
+{
+	/*
+	 * Copy and empty the queue first, avoiding the possibility of an infinite loop if entries
+	 * are returned to the queue by the callback function.
+	 */
+	struct wait_queue waiters;
+
+	vdo_initialize_wait_queue(&waiters);
+	vdo_transfer_all_waiters(queue, &waiters);
+
+	/* Drain the copied queue, invoking the callback on every entry. */
+	while (vdo_notify_next_waiter(&waiters, callback, context))
+		/* All the work is done by the loop condition. */
+		;
+}
+
+/**
+ * vdo_get_first_waiter() - Return the waiter that is at the head end of a wait queue.
+ * @queue: The queue from which to get the first waiter.
+ *
+ * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty.
+ */
+struct waiter *vdo_get_first_waiter(const struct wait_queue *queue)
+{
+	struct waiter *last_waiter = queue->last_waiter;
+
+	if (last_waiter == NULL)
+		/* There are no waiters, so we're done. */
+		return NULL;
+
+	/* The queue is circular, so the last entry links to the head of the queue. */
+	return last_waiter->next_waiter;
+}
+
+/**
+ * vdo_dequeue_matching_waiters() - Remove all waiters that match based on the specified matching
+ *                                  method and append them to a wait_queue.
+ * @queue: The wait queue to process.
+ * @match_method: The method to determine matching.
+ * @match_context: Contextual info for the match method.
+ * @matched_queue: A wait_queue to store matches.
+ */
+void vdo_dequeue_matching_waiters(struct wait_queue *queue,
+				  waiter_match *match_method,
+				  void *match_context,
+				  struct wait_queue *matched_queue)
+{
+	struct wait_queue matched_waiters, iteration_queue;
+
+	vdo_initialize_wait_queue(&matched_waiters);
+
+	vdo_initialize_wait_queue(&iteration_queue);
+	vdo_transfer_all_waiters(queue, &iteration_queue);
+	while (vdo_has_waiters(&iteration_queue)) {
+		struct waiter *waiter = vdo_dequeue_next_waiter(&iteration_queue);
+
+		vdo_enqueue_waiter((match_method(waiter, match_context) ?
+				    &matched_waiters :
+				    queue),
+				   waiter);
+	}
+
+	vdo_transfer_all_waiters(&matched_waiters, matched_queue);
+}
+
+/**
+ * vdo_dequeue_next_waiter() - Remove the first waiter from the head end of a wait queue.
+ * @queue: The wait queue from which to remove the first entry.
+ *
+ * The caller will be responsible for waking the waiter by invoking the correct callback function
+ * to resume its execution.
+ *
+ * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty.
+ */
+struct waiter *vdo_dequeue_next_waiter(struct wait_queue *queue)
+{
+	struct waiter *first_waiter = vdo_get_first_waiter(queue);
+	struct waiter *last_waiter = queue->last_waiter;
+
+	if (first_waiter == NULL)
+		return NULL;
+
+	if (first_waiter == last_waiter)
+		/* The queue has a single entry, so just empty it out by nulling the tail. */
+		queue->last_waiter = NULL;
+	else
+		/*
+		 * The queue has more than one entry, so splice the first waiter out of the
+		 * circular queue.
+		 */
+		last_waiter->next_waiter = first_waiter->next_waiter;
+
+	/* The waiter is no longer in a wait queue. */
+	first_waiter->next_waiter = NULL;
+	queue->queue_length -= 1;
+	return first_waiter;
+}
+
+/**
+ * vdo_notify_next_waiter() - Notify the next entry waiting in a queue.
+ * @queue: The wait queue containing the waiter to notify.
+ * @callback: The function to call to notify the waiter, or NULL to invoke the callback field
+ *            registered in the waiter.
+ * @context: The context to pass to the callback function.
+ *
+ * Notifies the next entry waiting in a queue to continue execution by invoking a callback function
+ * on it after removing it from the queue.
+ *
+ * Return: true if there was a waiter in the queue.
+ */
+bool vdo_notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, void *context)
+{
+	struct waiter *waiter = vdo_dequeue_next_waiter(queue);
+
+	if (waiter == NULL)
+		return false;
+
+	if (callback == NULL)
+		callback = waiter->callback;
+	(*callback)(waiter, context);
+	return true;
+}
+
+/**
+ * vdo_get_next_waiter() - Get the waiter after this one, for debug iteration.
+ * @queue: The wait queue.
+ * @waiter: A waiter.
+ *
+ * Return: The next waiter, or NULL.
+ */
+const struct waiter *
+vdo_get_next_waiter(const struct wait_queue *queue, const struct waiter *waiter)
+{
+	struct waiter *first_waiter = vdo_get_first_waiter(queue);
+
+	if (waiter == NULL)
+		return first_waiter;
+	return ((waiter->next_waiter != first_waiter) ? waiter->next_waiter : NULL);
+}
diff --git a/drivers/md/dm-vdo/wait-queue.h b/drivers/md/dm-vdo/wait-queue.h
new file mode 100644
index 00000000000..f5f8bb3a7a4
--- /dev/null
+++ b/drivers/md/dm-vdo/wait-queue.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_WAIT_QUEUE_H
+#define VDO_WAIT_QUEUE_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: Wait queues.
+ *
+ * A wait queue is a circular list of entries waiting to be notified of a change in a condition.
+ * Keeping a circular list allows the queue structure to simply be a pointer to the tail (newest)
+ * entry in the queue, supporting constant-time enqueue and dequeue operations. A null pointer is
+ * an empty queue.
+ *
+ *   An empty queue:
+ *     queue0.last_waiter -> NULL
+ *
+ *   A singleton queue:
+ *     queue1.last_waiter -> entry1 -> entry1 -> [...]
+ *
+ *   A three-element queue:
+ *     queue2.last_waiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...]
+ */
+
+struct waiter;
+
+struct wait_queue {
+	/* The tail of the queue, the last (most recently added) entry */
+	struct waiter *last_waiter;
+	/* The number of waiters currently in the queue */
+	size_t queue_length;
+};
+
+/**
+ * typedef waiter_callback - Callback type for functions which will be called to resume processing
+ *                           of a waiter after it has been removed from its wait queue.
+ */
+typedef void waiter_callback(struct waiter *waiter, void *context);
+
+/**
+ * typedef waiter_match - Method type for waiter matching methods.
+ *
+ * A waiter_match method returns false if the waiter does not match.
+ */
+typedef bool waiter_match(struct waiter *waiter, void *context);
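+
+/*
+ * A hypothetical match method (for illustration only): treat the context as a specific waiter,
+ * so vdo_dequeue_matching_waiters() extracts exactly that entry.
+ *
+ *	static bool example_waiter_is(struct waiter *waiter, void *context)
+ *	{
+ *		return (waiter == context);
+ *	}
+ */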
+
+/* The queue entry structure for entries in a wait_queue. */
+struct waiter {
+	/*
+	 * The next waiter in the queue. If this entry is the last waiter, then this is actually a
+	 * pointer back to the head of the queue.
+	 */
+	struct waiter *next_waiter;
+
+	/* Optional waiter-specific callback to invoke when waking this waiter. */
+	waiter_callback *callback;
+};
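+
+/*
+ * Example flow (an illustrative sketch; the callback and context names are assumptions): a waiter
+ * is normally embedded in a larger structure, given a callback, enqueued, and later woken.
+ *
+ *	struct wait_queue queue;
+ *
+ *	vdo_initialize_wait_queue(&queue);
+ *	waiter->callback = my_callback;
+ *	vdo_enqueue_waiter(&queue, waiter);
+ *	...
+ *	vdo_notify_all_waiters(&queue, NULL, context);  (invokes my_callback on each waiter)
+ */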
+
+/**
+ * vdo_is_waiting() - Check whether a waiter is waiting.
+ * @waiter: The waiter to check.
+ *
+ * Return: true if the waiter is on some wait_queue.
+ */
+static inline bool vdo_is_waiting(struct waiter *waiter)
+{
+	return (waiter->next_waiter != NULL);
+}
+
+/**
+ * vdo_initialize_wait_queue() - Initialize a wait queue.
+ * @queue: The queue to initialize.
+ */
+static inline void vdo_initialize_wait_queue(struct wait_queue *queue)
+{
+	*queue = (struct wait_queue) {
+		.last_waiter = NULL,
+		.queue_length = 0,
+	};
+}
+
+/**
+ * vdo_has_waiters() - Check whether a wait queue has any entries waiting in it.
+ * @queue: The queue to query.
+ *
+ * Return: true if there are any waiters in the queue.
+ */
+static inline bool __must_check vdo_has_waiters(const struct wait_queue *queue)
+{
+	return (queue->last_waiter != NULL);
+}
+
+void vdo_enqueue_waiter(struct wait_queue *queue, struct waiter *waiter);
+
+void vdo_notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, void *context);
+
+bool vdo_notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, void *context);
+
+void vdo_transfer_all_waiters(struct wait_queue *from_queue, struct wait_queue *to_queue);
+
+struct waiter *vdo_get_first_waiter(const struct wait_queue *queue);
+
+void vdo_dequeue_matching_waiters(struct wait_queue *queue,
+				  waiter_match *match_method,
+				  void *match_context,
+				  struct wait_queue *matched_queue);
+
+struct waiter *vdo_dequeue_next_waiter(struct wait_queue *queue);
+
+/**
+ * vdo_count_waiters() - Count the number of waiters in a wait queue.
+ * @queue: The wait queue to query.
+ *
+ * Return: The number of waiters in the queue.
+ */
+static inline size_t __must_check vdo_count_waiters(const struct wait_queue *queue)
+{
+	return queue->queue_length;
+}
+
+const struct waiter * __must_check
+vdo_get_next_waiter(const struct wait_queue *queue, const struct waiter *waiter);
+
+#endif /* VDO_WAIT_QUEUE_H */
diff --git a/drivers/md/dm-vdo/work-queue.c b/drivers/md/dm-vdo/work-queue.c
new file mode 100644
index 00000000000..40cf1bfb947
--- /dev/null
+++ b/drivers/md/dm-vdo/work-queue.c
@@ -0,0 +1,658 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "work-queue.h"
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+
+#include "funnel-queue.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "completion.h"
+#include "status-codes.h"
+
+static DEFINE_PER_CPU(unsigned int, service_queue_rotor);
+
+/**
+ * DOC: Work queue definition.
+ *
+ * There are two types of work queues: simple, with one worker thread, and round-robin, which uses
+ * a group of the former to do the work, and assigns work to them in round-robin fashion (roughly).
+ * Externally, both are represented via the same common sub-structure, though there's actually not
+ * a great deal of overlap between the two types internally.
+ */
+struct vdo_work_queue {
+	/* Name of just the work queue (e.g., "cpuQ12") */
+	char *name;
+	bool round_robin_mode;
+	struct vdo_thread *owner;
+	/* Life cycle functions, etc */
+	const struct vdo_work_queue_type *type;
+};
+
+struct simple_work_queue {
+	struct vdo_work_queue common;
+	struct funnel_queue *priority_lists[VDO_WORK_Q_MAX_PRIORITY + 1];
+	void *private;
+
+	/*
+	 * The fields above are unchanged after setup but often read, and are good candidates for
+	 * caching -- and if the max priority is 2, just fit in one x86-64 cache line if aligned.
+	 * The fields below are often modified as we sleep and wake, so we want a separate cache
+	 * line for performance.
+	 */
+
+	/* Any (0 or 1) worker threads waiting for new work to do */
+	wait_queue_head_t waiting_worker_threads ____cacheline_aligned;
+	/* Hack to reduce wakeup calls if the worker thread is running */
+	atomic_t idle;
+
+	/* These are infrequently used so in terms of performance we don't care where they land. */
+	struct task_struct *thread;
+	/* Notify creator once worker has initialized */
+	struct completion *started;
+};
+
+struct round_robin_work_queue {
+	struct vdo_work_queue common;
+	struct simple_work_queue **service_queues;
+	unsigned int num_service_queues;
+};
+
+static inline struct simple_work_queue *as_simple_work_queue(struct vdo_work_queue *queue)
+{
+	return ((queue == NULL) ? NULL : container_of(queue, struct simple_work_queue, common));
+}
+
+static inline struct round_robin_work_queue *
+as_round_robin_work_queue(struct vdo_work_queue *queue)
+{
+	return ((queue == NULL) ?
+		 NULL :
+		 container_of(queue, struct round_robin_work_queue, common));
+}
+
+/* Processing normal completions. */
+
+/*
+ * Dequeue and return the next waiting completion, if any.
+ *
+ * We scan the funnel queues from highest priority to lowest, once; there is therefore a race
+ * condition where a high-priority completion can be enqueued followed by a lower-priority one, and
+ * we'll grab the latter (but we'll catch the high-priority item on the next call). If strict
+ * enforcement of priorities becomes necessary, this function will need fixing.
+ */
+static struct vdo_completion *poll_for_completion(struct simple_work_queue *queue)
+{
+	int i;
+
+	for (i = queue->common.type->max_priority; i >= 0; i--) {
+		struct funnel_queue_entry *link = uds_funnel_queue_poll(queue->priority_lists[i]);
+
+		if (link != NULL)
+			return container_of(link, struct vdo_completion, work_queue_entry_link);
+	}
+
+	return NULL;
+}
+
+static void
+enqueue_work_queue_completion(struct simple_work_queue *queue, struct vdo_completion *completion)
+{
+	ASSERT_LOG_ONLY(completion->my_queue == NULL,
+			"completion %px (fn %px) to enqueue (%px) is not already queued (%px)",
+			completion,
+			completion->callback,
+			queue,
+			completion->my_queue);
+	if (completion->priority == VDO_WORK_Q_DEFAULT_PRIORITY)
+		completion->priority = queue->common.type->default_priority;
+
+	if (ASSERT(completion->priority <= queue->common.type->max_priority,
+		   "priority is in range for queue") != VDO_SUCCESS)
+		completion->priority = 0;
+
+	completion->my_queue = &queue->common;
+
+	/* Funnel queue handles the synchronization for the put. */
+	uds_funnel_queue_put(queue->priority_lists[completion->priority],
+			     &completion->work_queue_entry_link);
+
+	/*
+	 * Due to how funnel queue synchronization is handled (just atomic operations), the
+	 * simplest safe implementation here would be to wake-up any waiting threads after
+	 * enqueueing each item. Even if the funnel queue is not empty at the time of adding an
+	 * item to the queue, the consumer thread may not see this since it is not guaranteed to
+	 * have the same view of the queue as a producer thread.
+	 *
+	 * However, the above is wasteful so instead we attempt to minimize the number of thread
+	 * wakeups. Using an idle flag, and careful ordering using memory barriers, we should be
+	 * able to determine when the worker thread might be asleep or going to sleep. We use
+	 * cmpxchg to try to take ownership (vs other producer threads) of the responsibility for
+	 * waking the worker thread, so multiple wakeups aren't tried at once.
+	 *
+	 * This was tuned for some x86 boxes that were handy; it's untested whether doing the read
+	 * first is any better or worse for other platforms, even other x86 configurations.
+	 */
+	smp_mb();
+	if ((atomic_read(&queue->idle) != 1) || (atomic_cmpxchg(&queue->idle, 1, 0) != 1))
+		return;
+
+	/* There's a maximum of one thread in this list. */
+	wake_up(&queue->waiting_worker_threads);
+}
+
+static void run_start_hook(struct simple_work_queue *queue)
+{
+	if (queue->common.type->start != NULL)
+		queue->common.type->start(queue->private);
+}
+
+static void run_finish_hook(struct simple_work_queue *queue)
+{
+	if (queue->common.type->finish != NULL)
+		queue->common.type->finish(queue->private);
+}
+
+/*
+ * Wait for the next completion to process, or until kthread_should_stop indicates that it's time
+ * for us to shut down.
+ *
+ * If kthread_should_stop() says it's time to stop but we have pending completions, return a
+ * completion so it can still be processed before the thread exits.
+ */
+static struct vdo_completion *wait_for_next_completion(struct simple_work_queue *queue)
+{
+	struct vdo_completion *completion;
+	DEFINE_WAIT(wait);
+
+	while (true) {
+		prepare_to_wait(&queue->waiting_worker_threads, &wait, TASK_INTERRUPTIBLE);
+		/*
+		 * Don't set the idle flag until a wakeup will not be lost.
+		 *
+		 * Force synchronization between setting the idle flag and checking the funnel
+		 * queue; the producer side will do them in the reverse order. (There's still a
+		 * race condition we've chosen to allow, because we've got a timeout below that
+		 * unwedges us if we hit it, but this may narrow the window a little.)
+		 */
+		atomic_set(&queue->idle, 1);
+		smp_mb(); /* store-load barrier between "idle" and funnel queue */
+
+		completion = poll_for_completion(queue);
+		if (completion != NULL)
+			break;
+
+		/*
+		 * We need to check for thread-stop after setting TASK_INTERRUPTIBLE state up
+		 * above. Otherwise, schedule() will put the thread to sleep and might miss a
+		 * wakeup from kthread_stop() call in vdo_finish_work_queue().
+		 */
+		if (kthread_should_stop())
+			break;
+
+		schedule();
+
+		/*
+		 * Most of the time when we wake, it should be because there's work to do. If it
+		 * was a spurious wakeup, continue looping.
+		 */
+		completion = poll_for_completion(queue);
+		if (completion != NULL)
+			break;
+	}
+
+	finish_wait(&queue->waiting_worker_threads, &wait);
+	atomic_set(&queue->idle, 0);
+
+	return completion;
+}
+
+static void process_completion(struct simple_work_queue *queue, struct vdo_completion *completion)
+{
+	if (ASSERT(completion->my_queue == &queue->common,
+		   "completion %px from queue %px marked as being in this queue (%px)",
+		   completion,
+		   queue,
+		   completion->my_queue) == UDS_SUCCESS)
+		completion->my_queue = NULL;
+
+	vdo_run_completion(completion);
+}
+
+static void service_work_queue(struct simple_work_queue *queue)
+{
+	run_start_hook(queue);
+
+	while (true) {
+		struct vdo_completion *completion = poll_for_completion(queue);
+
+		if (completion == NULL)
+			completion = wait_for_next_completion(queue);
+
+		if (completion == NULL)
+			/* No completions but kthread_should_stop() was triggered. */
+			break;
+
+		process_completion(queue, completion);
+
+		/*
+		 * Be friendly to a CPU that has other work to do, if the kernel has told us to.
+		 * This speeds up some performance tests; that "other work" might include other VDO
+		 * threads.
+		 */
+		if (need_resched())
+			cond_resched();
+	}
+
+	run_finish_hook(queue);
+}
+
+static int work_queue_runner(void *ptr)
+{
+	struct simple_work_queue *queue = ptr;
+
+	complete(queue->started);
+	service_work_queue(queue);
+	return 0;
+}
+
+/* Creation & teardown */
+
+static void free_simple_work_queue(struct simple_work_queue *queue)
+{
+	unsigned int i;
+
+	for (i = 0; i <= VDO_WORK_Q_MAX_PRIORITY; i++)
+		uds_free_funnel_queue(queue->priority_lists[i]);
+	UDS_FREE(queue->common.name);
+	UDS_FREE(queue);
+}
+
+static void free_round_robin_work_queue(struct round_robin_work_queue *queue)
+{
+	struct simple_work_queue **queue_table = queue->service_queues;
+	unsigned int count = queue->num_service_queues;
+	unsigned int i;
+
+	queue->service_queues = NULL;
+
+	for (i = 0; i < count; i++)
+		free_simple_work_queue(queue_table[i]);
+	UDS_FREE(queue_table);
+	UDS_FREE(queue->common.name);
+	UDS_FREE(queue);
+}
+
+void vdo_free_work_queue(struct vdo_work_queue *queue)
+{
+	if (queue == NULL)
+		return;
+
+	vdo_finish_work_queue(queue);
+
+	if (queue->round_robin_mode)
+		free_round_robin_work_queue(as_round_robin_work_queue(queue));
+	else
+		free_simple_work_queue(as_simple_work_queue(queue));
+}
+
+static int make_simple_work_queue(const char *thread_name_prefix,
+				  const char *name,
+				  struct vdo_thread *owner,
+				  void *private,
+				  const struct vdo_work_queue_type *type,
+				  struct simple_work_queue **queue_ptr)
+{
+	DECLARE_COMPLETION_ONSTACK(started);
+	struct simple_work_queue *queue;
+	int i;
+	struct task_struct *thread = NULL;
+	int result;
+
+	ASSERT_LOG_ONLY((type->max_priority <= VDO_WORK_Q_MAX_PRIORITY),
+			"queue priority count %u within limit %u",
+			type->max_priority,
+			VDO_WORK_Q_MAX_PRIORITY);
+
+	result = UDS_ALLOCATE(1, struct simple_work_queue, "simple work queue", &queue);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	queue->private = private;
+	queue->started = &started;
+	queue->common.type = type;
+	queue->common.owner = owner;
+	init_waitqueue_head(&queue->waiting_worker_threads);
+
+	result = uds_duplicate_string(name, "queue name", &queue->common.name);
+	if (result != VDO_SUCCESS) {
+		UDS_FREE(queue);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i <= type->max_priority; i++) {
+		result = uds_make_funnel_queue(&queue->priority_lists[i]);
+		if (result != UDS_SUCCESS) {
+			free_simple_work_queue(queue);
+			return result;
+		}
+	}
+
+	thread = kthread_run(work_queue_runner,
+			     queue,
+			     "%s:%s",
+			     thread_name_prefix,
+			     queue->common.name);
+	if (IS_ERR(thread)) {
+		free_simple_work_queue(queue);
+		return (int) PTR_ERR(thread);
+	}
+
+	queue->thread = thread;
+
+	/*
+	 * If we don't wait to ensure the thread is running VDO code, a quick kthread_stop (due to
+	 * errors elsewhere) could cause it to never get as far as running VDO, skipping the
+	 * cleanup code.
+	 *
+	 * Eventually we should just make that path safe too, and then we won't need this
+	 * synchronization.
+	 */
+	wait_for_completion(&started);
+
+	*queue_ptr = queue;
+	return UDS_SUCCESS;
+}
+
+/**
+ * vdo_make_work_queue() - Create a work queue; if multiple threads are requested, completions
+ *                         will be distributed to them in round-robin fashion.
+ *
+ * Each queue is associated with a struct vdo_thread which has a single vdo thread id. Regardless
+ * of the actual number of queues and threads allocated here, code outside of the queue
+ * implementation will treat this as a single zone.
+ */
+int vdo_make_work_queue(const char *thread_name_prefix,
+			const char *name,
+			struct vdo_thread *owner,
+			const struct vdo_work_queue_type *type,
+			unsigned int thread_count,
+			void *thread_privates[],
+			struct vdo_work_queue **queue_ptr)
+{
+	struct round_robin_work_queue *queue;
+	int result;
+	char thread_name[TASK_COMM_LEN];
+	unsigned int i;
+
+	if (thread_count == 1) {
+		struct simple_work_queue *simple_queue;
+		void *context = ((thread_privates != NULL) ? thread_privates[0] : NULL);
+
+		result = make_simple_work_queue(thread_name_prefix,
+						name,
+						owner,
+						context,
+						type,
+						&simple_queue);
+		if (result == VDO_SUCCESS)
+			*queue_ptr = &simple_queue->common;
+		return result;
+	}
+
+	result = UDS_ALLOCATE(1, struct round_robin_work_queue, "round-robin work queue", &queue);
+	if (result != UDS_SUCCESS)
+		return result;
+
+	result = UDS_ALLOCATE(thread_count,
+			      struct simple_work_queue *,
+			      "subordinate work queues",
+			      &queue->service_queues);
+	if (result != UDS_SUCCESS) {
+		UDS_FREE(queue);
+		return result;
+	}
+
+	queue->num_service_queues = thread_count;
+	queue->common.round_robin_mode = true;
+	queue->common.owner = owner;
+
+	result = uds_duplicate_string(name, "queue name", &queue->common.name);
+	if (result != VDO_SUCCESS) {
+		UDS_FREE(queue->service_queues);
+		UDS_FREE(queue);
+		return -ENOMEM;
+	}
+
+	*queue_ptr = &queue->common;
+
+	for (i = 0; i < thread_count; i++) {
+		void *context = ((thread_privates != NULL) ? thread_privates[i] : NULL);
+
+		snprintf(thread_name, sizeof(thread_name), "%s%u", name, i);
+		result = make_simple_work_queue(thread_name_prefix,
+						thread_name,
+						owner,
+						context,
+						type,
+						&queue->service_queues[i]);
+		if (result != VDO_SUCCESS) {
+			queue->num_service_queues = i;
+			/* Destroy previously created subordinates. */
+			vdo_free_work_queue(UDS_FORGET(*queue_ptr));
+			return result;
+		}
+	}
+
+	return VDO_SUCCESS;
+}
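+
+/*
+ * For example, a hypothetical caller could create a four-thread round-robin queue as follows
+ * (the type initializer, callback names, and priority values below are invented purely for
+ * illustration; only the vdo_make_work_queue() signature and the vdo_work_queue_type fields are
+ * from this file). The worker threads would be named "vdo0:cpuQ0" through "vdo0:cpuQ3".
+ *
+ *	static const struct vdo_work_queue_type example_type = {
+ *		.start = example_thread_start,
+ *		.finish = example_thread_finish,
+ *		.max_priority = 2,
+ *		.default_priority = 1,
+ *	};
+ *
+ *	struct vdo_work_queue *queue;
+ *	int result;
+ *
+ *	result = vdo_make_work_queue("vdo0", "cpuQ", owner, &example_type, 4, NULL, &queue);
+ *	if (result != VDO_SUCCESS)
+ *		return result;
+ */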
+
+static void finish_simple_work_queue(struct simple_work_queue *queue)
+{
+	if (queue->thread == NULL)
+		return;
+
+	/* Tells the worker thread to shut down and waits for it to exit. */
+	kthread_stop(queue->thread);
+	queue->thread = NULL;
+}
+
+static void finish_round_robin_work_queue(struct round_robin_work_queue *queue)
+{
+	struct simple_work_queue **queue_table = queue->service_queues;
+	unsigned int count = queue->num_service_queues;
+	unsigned int i;
+
+	for (i = 0; i < count; i++)
+		finish_simple_work_queue(queue_table[i]);
+}
+
+/* No enqueueing of completions should be done once this function is called. */
+void vdo_finish_work_queue(struct vdo_work_queue *queue)
+{
+	if (queue == NULL)
+		return;
+
+	if (queue->round_robin_mode)
+		finish_round_robin_work_queue(as_round_robin_work_queue(queue));
+	else
+		finish_simple_work_queue(as_simple_work_queue(queue));
+}
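+
+/*
+ * For example, a shutdown sketch respecting the rule above (the submitter-stopping step is a
+ * hypothetical caller-side responsibility, not part of this file):
+ *
+ *	stop_all_submitters(context);			(hypothetical caller-side step)
+ *	vdo_finish_work_queue(queue);			stop and join the worker threads
+ *	vdo_free_work_queue(UDS_FORGET(queue));		then release the queue itself
+ */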
+
+/* Debugging dumps */
+
+static void dump_simple_work_queue(struct simple_work_queue *queue)
+{
+	const char *thread_status = "no threads";
+	char task_state_report = '-';
+
+	if (queue->thread != NULL) {
+		task_state_report = task_state_to_char(queue->thread);
+		thread_status = atomic_read(&queue->idle) ? "idle" : "running";
+	}
+
+	uds_log_info("workQ %px (%s) %s (%c)",
+		     &queue->common,
+		     queue->common.name,
+		     thread_status,
+		     task_state_report);
+
+	/* TODO: also report the ->waiting_worker_threads status (is anyone waiting?). */
+}
+
+/* Dump debugging info about a work queue (or each of its service queues) to the kernel log. */
+void vdo_dump_work_queue(struct vdo_work_queue *queue)
+{
+	if (queue->round_robin_mode) {
+		struct round_robin_work_queue *round_robin = as_round_robin_work_queue(queue);
+		unsigned int i;
+
+		for (i = 0; i < round_robin->num_service_queues; i++)
+			dump_simple_work_queue(round_robin->service_queues[i]);
+	} else {
+		dump_simple_work_queue(as_simple_work_queue(queue));
+	}
+}
+
+static void get_function_name(void *pointer, char *buffer, size_t buffer_length)
+{
+	if (pointer == NULL) {
+		/*
+		 * Format "%ps" logs a null pointer as "(null)" with a bunch of leading spaces.
+		 * Since this is sometimes used when logging lots of data at once, be terse: use "-".
+		 */
+		strscpy(buffer, "-", buffer_length);
+	} else {
+		/*
+		 * Use a pragma to defeat gcc's format checking, which doesn't understand that
+		 * "%ps" actually does support a precision spec in Linux kernel code.
+		 */
+		char *space;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat"
+		snprintf(buffer, buffer_length, "%.*ps", (int) (buffer_length - 1), pointer);
+#pragma GCC diagnostic pop
+
+		space = strchr(buffer, ' ');
+		if (space != NULL)
+			*space = '\0';
+	}
+}
+
+/**
+ * Write to the buffer some info about the completion, for logging. Since the common use case is
+ * dumping info about a lot of completions to syslog all at once, the format favors brevity over
+ * readability.
+ */
+void vdo_dump_completion_to_buffer(struct vdo_completion *completion, char *buffer, size_t length)
+{
+	size_t current_length =
+		scnprintf(buffer,
+			  length,
+			  "%.*s/",
+			  TASK_COMM_LEN,
+			  (completion->my_queue == NULL ? "-" : completion->my_queue->name));
+
+	if (current_length < length)
+		get_function_name((void *) completion->callback,
+				  buffer + current_length,
+				  length - current_length);
+}
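+
+/*
+ * For example (names invented for illustration), a completion queued on "cpuQ0" with callback
+ * example_callback() dumps as the terse string "cpuQ0/example_callback":
+ *
+ *	char buf[128];
+ *
+ *	vdo_dump_completion_to_buffer(completion, buf, sizeof(buf));
+ *	uds_log_info("pending: %s", buf);
+ */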
+
+/* Completion submission */
+/*
+ * If the completion has a timeout that has already passed, the timeout handler function may be
+ * invoked by this function.
+ */
+void vdo_enqueue_work_queue(struct vdo_work_queue *queue, struct vdo_completion *completion)
+{
+	/*
+	 * Convert the provided generic vdo_work_queue to the simple_work_queue to actually queue
+	 * on.
+	 */
+	struct simple_work_queue *simple_queue = NULL;
+
+	if (!queue->round_robin_mode) {
+		simple_queue = as_simple_work_queue(queue);
+	} else {
+		struct round_robin_work_queue *round_robin = as_round_robin_work_queue(queue);
+
+		/*
+		 * It shouldn't be a big deal if the same rotor gets used for multiple work queues.
+		 * Any patterns that might develop are likely to be disrupted by the random ordering
+		 * of multiple completions and by migration between cores, unless the load is so
+		 * light that task ordering is completely regular and the threads are pinned to
+		 * individual cores; with a load that light, we won't care.
+		 */
+		unsigned int rotor = this_cpu_inc_return(service_queue_rotor);
+		unsigned int index = rotor % round_robin->num_service_queues;
+
+		simple_queue = round_robin->service_queues[index];
+	}
+
+	enqueue_work_queue_completion(simple_queue, completion);
+}
+
+/* Misc */
+
+/*
+ * Return the simple work queue being serviced by the current thread, or NULL if the current
+ * thread is not a work queue thread.
+ */
+static struct simple_work_queue *get_current_thread_work_queue(void)
+{
+	/*
+	 * In interrupt context, if a vdo thread is what got interrupted, the calls below will find
+	 * the queue for the thread which was interrupted. However, the interrupted thread may have
+	 * been processing a completion, in which case starting to process another would violate
+	 * our concurrency assumptions.
+	 */
+	if (in_interrupt())
+		return NULL;
+	if (kthread_func(current) != work_queue_runner)
+		/* Not a VDO work queue thread. */
+		return NULL;
+	return kthread_data(current);
+}
+
+struct vdo_work_queue *vdo_get_current_work_queue(void)
+{
+	struct simple_work_queue *queue = get_current_thread_work_queue();
+
+	return (queue == NULL) ? NULL : &queue->common;
+}
+
+struct vdo_thread *vdo_get_work_queue_owner(struct vdo_work_queue *queue)
+{
+	return queue->owner;
+}
+
+/**
+ * Returns the private data for the current thread's work queue, or NULL if none or if the current
+ * thread is not a work queue thread.
+ */
+void *vdo_get_work_queue_private_data(void)
+{
+	struct simple_work_queue *queue = get_current_thread_work_queue();
+
+	return (queue != NULL) ? queue->private : NULL;
+}
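+
+/*
+ * For example (a sketch with an invented context type): the private data is the per-thread
+ * context passed via thread_privates[] at queue creation, so work running on the queue can
+ * recover its own context without locking:
+ *
+ *	struct example_zone *zone = vdo_get_work_queue_private_data();
+ *
+ *	if (zone == NULL)
+ *		return;		(not running on a work queue thread)
+ */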
+
+bool vdo_work_queue_type_is(struct vdo_work_queue *queue, const struct vdo_work_queue_type *type)
+{
+	return (queue->type == type);
+}
diff --git a/drivers/md/dm-vdo/work-queue.h b/drivers/md/dm-vdo/work-queue.h
new file mode 100644
index 00000000000..d1e05f8901d
--- /dev/null
+++ b/drivers/md/dm-vdo/work-queue.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef VDO_WORK_QUEUE_H
+#define VDO_WORK_QUEUE_H
+
+#include <linux/sched.h> /* for TASK_COMM_LEN */
+
+#include "types.h"
+
+enum {
+	MAX_VDO_WORK_QUEUE_NAME_LEN = TASK_COMM_LEN,
+};
+
+struct vdo_work_queue_type {
+	void (*start)(void *context);
+	void (*finish)(void *context);
+	enum vdo_completion_priority max_priority;
+	enum vdo_completion_priority default_priority;
+};
+
+struct vdo_completion;
+struct vdo_thread;
+struct vdo_work_queue;
+
+int vdo_make_work_queue(const char *thread_name_prefix,
+			const char *name,
+			struct vdo_thread *owner,
+			const struct vdo_work_queue_type *type,
+			unsigned int thread_count,
+			void *thread_privates[],
+			struct vdo_work_queue **queue_ptr);
+
+void vdo_enqueue_work_queue(struct vdo_work_queue *queue, struct vdo_completion *completion);
+
+void vdo_finish_work_queue(struct vdo_work_queue *queue);
+
+void vdo_free_work_queue(struct vdo_work_queue *queue);
+
+void vdo_dump_work_queue(struct vdo_work_queue *queue);
+
+void vdo_dump_completion_to_buffer(struct vdo_completion *completion, char *buffer, size_t length);
+
+void *vdo_get_work_queue_private_data(void);
+struct vdo_work_queue *vdo_get_current_work_queue(void);
+struct vdo_thread *vdo_get_work_queue_owner(struct vdo_work_queue *queue);
+
+bool __must_check
+vdo_work_queue_type_is(struct vdo_work_queue *queue, const struct vdo_work_queue_type *type);
+
+#endif /* VDO_WORK_QUEUE_H */
-- 
2.40.0
