All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ndctl: add a BTT check utility
@ 2017-01-20  3:12 Vishal Verma
  2017-01-20 20:04 ` Dan Williams
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Vishal Verma @ 2017-01-20  3:12 UTC (permalink / raw)
  To: linux-nvdimm

Add the check-namespace command to ndctl. This will check the BTT
metadata layout for the given namespace, and if requested, correct any
errors found. Not all metadata corruption is detectable or fixable.

Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 Documentation/Makefile.am               |   1 +
 Documentation/namespace-check.txt       |   7 +
 Documentation/ndctl-check-namespace.txt |  46 ++
 Documentation/ndctl.txt                 |   1 +
 builtin.h                               |   3 +
 configure.ac                            |  10 +
 contrib/ndctl                           |   3 +
 ndctl/btt-structs.h                     | 117 ++++
 ndctl/builtin-xaction-namespace.c       | 910 +++++++++++++++++++++++++++++++-
 ndctl/ndctl.c                           |   3 +
 util/util.h                             |   8 +
 11 files changed, 1107 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/namespace-check.txt
 create mode 100644 Documentation/ndctl-check-namespace.txt
 create mode 100644 ndctl/btt-structs.h

diff --git a/Documentation/Makefile.am b/Documentation/Makefile.am
index adcc9e7..92790b6 100644
--- a/Documentation/Makefile.am
+++ b/Documentation/Makefile.am
@@ -12,6 +12,7 @@ man1_MANS = \
 	ndctl-disable-namespace.1 \
 	ndctl-create-namespace.1 \
 	ndctl-destroy-namespace.1 \
+	ndctl-check-namespace.1 \
 	ndctl-list.1
 
 CLEANFILES = $(man1_MANS)
diff --git a/Documentation/namespace-check.txt b/Documentation/namespace-check.txt
new file mode 100644
index 0000000..d304e83
--- /dev/null
+++ b/Documentation/namespace-check.txt
@@ -0,0 +1,7 @@
+DESCRIPTION
+-----------
+
+A namespace in the 'sector' mode will have metadata on it to describe
+the kernel BTT (Block Translation Table). The check-namespace command
+can be used to check the consistency of this metadata, and also attempt
+to repair it, if it has enough information to do so.
diff --git a/Documentation/ndctl-check-namespace.txt b/Documentation/ndctl-check-namespace.txt
new file mode 100644
index 0000000..18e44fc
--- /dev/null
+++ b/Documentation/ndctl-check-namespace.txt
@@ -0,0 +1,46 @@
+ndctl-check-namespace(1)
+=========================
+
+NAME
+----
+ndctl-check-namespace - check namespace metadata consistency
+
+SYNOPSIS
+--------
+[verse]
+'ndctl check-namespace' [<options>]
+
+include::namespace-check.txt[]
+
+EXAMPLES
+--------
+
+Check a BTT namespace
+[verse]
+ndctl disable-namespace namespace0.0
+ndctl check-namespace namespace0.0
+
+Check a BTT namespace, but don't actually restore/correct anything
+[verse]
+ndctl check-namespace --dry-run
+
+OPTIONS
+-------
+-n::
+--dry-run::
+	Only check the metadata for consistency, don't write anything.
+
+-v::
+--verbose::
+	Emit debug messages for the namespace creation process
+
+-r::
+--region=::
+include::xable-region-options.txt[]
+
+SEE ALSO
+--------
+linkndctl:ndctl-disable-namespace[1],
+linkndctl:ndctl-enable-namespace[1],
+http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf[NVDIMM Namespace
+Specification]
diff --git a/Documentation/ndctl.txt b/Documentation/ndctl.txt
index 883a59c..c26cc2f 100644
--- a/Documentation/ndctl.txt
+++ b/Documentation/ndctl.txt
@@ -34,6 +34,7 @@ SEE ALSO
 --------
 linkndctl:ndctl-create-namespace[1],
 linkndctl:ndctl-destroy-namespace[1],
+linkndctl:ndctl-check-namespace[1],
 linkndctl:ndctl-enable-region[1],
 linkndctl:ndctl-disable-region[1],
 linkndctl:ndctl-enable-dimm[1],
diff --git a/builtin.h b/builtin.h
index 9b66196..b817ced 100644
--- a/builtin.h
+++ b/builtin.h
@@ -13,6 +13,9 @@ int cmd_enable_namespace(int argc, const char **argv, void *ctx);
 int cmd_create_namespace(int argc, const char **argv, void *ctx);
 int cmd_destroy_namespace(int argc, const char **argv, void *ctx);
 int cmd_disable_namespace(int argc, const char **argv, void *ctx);
+#ifdef ENABLE_CHECK_NAMESPACE
+int cmd_check_namespace(int argc, const char **argv, struct *ctx);
+#endif
 int cmd_enable_region(int argc, const char **argv, void *ctx);
 int cmd_disable_region(int argc, const char **argv, void *ctx);
 int cmd_enable_dimm(int argc, const char **argv, void *ctx);
diff --git a/configure.ac b/configure.ac
index e79623a..92a0f06 100644
--- a/configure.ac
+++ b/configure.ac
@@ -111,6 +111,16 @@ fi
 AC_SUBST([BASH_COMPLETION_DIR])
 AM_CONDITIONAL([ENABLE_BASH_COMPLETION],[test "x$with_bash_completion_dir" != "xno"])
 
+AC_ARG_ENABLE([check-namespace],
+        AS_HELP_STRING([--disable-check-namespace],
+		[disable the 'check namspace' command @<:@default=system@:>@]),
+	[], [enable_check_namespace=yes])
+
+AM_CONDITIONAL([ENABLE_CHECK_NAMESPACE], [test "x$enable_check_namespace" = "xyes"])
+AS_IF([test "x$enable_check_namespace" = "xyes"], [
+	AC_DEFINE([ENABLE_CHECK_NAMESPACE], [1], [check namespace support])
+])
+
 AC_ARG_ENABLE([local],
         AS_HELP_STRING([--disable-local], [build against kernel ndctl.h @<:@default=system@:>@]),
         [], [enable_local=yes])
diff --git a/contrib/ndctl b/contrib/ndctl
index ea7303c..c97adcc 100755
--- a/contrib/ndctl
+++ b/contrib/ndctl
@@ -194,6 +194,9 @@ __ndctl_comp_non_option_args()
 	destroy-namespace)
 		opts="$(__ndctl_get_ns) all"
 		;;
+	check-namespace)
+		opts="$(__ndctl_get_ns -i) all"
+		;;
 	enable-region)
 		opts="$(__ndctl_get_regions -i) all"
 		;;
diff --git a/ndctl/btt-structs.h b/ndctl/btt-structs.h
new file mode 100644
index 0000000..1329668
--- /dev/null
+++ b/ndctl/btt-structs.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _BTT_STRUCTS_H
+#define _BTT_STRUCTS_H
+
+#include <linux/types.h>
+
+#define BTT_SIG_LEN 16
+#define BTT_SIG "BTT_ARENA_INFO\0"
+#define MAP_ENT_SIZE 4
+#define MAP_TRIM_SHIFT 31
+#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT)
+#define MAP_ERR_SHIFT 30
+#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
+#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
+#define MAP_ENT_NORMAL 0xC0000000
+#define LOG_ENT_SIZE sizeof(struct log_entry)
+#define ARENA_MIN_SIZE (1UL << 24)	/* 16 MB */
+#define ARENA_MAX_SIZE (1ULL << 39)	/* 512 GB */
+#define BTT_PG_SIZE 4096
+#define BTT_DEFAULT_NFREE 256
+#define LOG_SEQ_INIT 1
+
+#define IB_FLAG_ERROR 0x00000001
+#define IB_FLAG_ERROR_MASK 0x00000001
+
+struct log_entry {
+	__le32 lba;
+	__le32 old_map;
+	__le32 new_map;
+	__le32 seq;
+	__le64 padding[2];
+};
+
+struct btt_sb {
+	__u8 signature[BTT_SIG_LEN];
+	__u8 uuid[16];
+	__u8 parent_uuid[16];
+	__le32 flags;
+	__le16 version_major;
+	__le16 version_minor;
+	__le32 external_lbasize;
+	__le32 external_nlba;
+	__le32 internal_lbasize;
+	__le32 internal_nlba;
+	__le32 nfree;
+	__le32 infosize;
+	__le64 nextoff;
+	__le64 dataoff;
+	__le64 mapoff;
+	__le64 logoff;
+	__le64 info2off;
+	__u8 padding[3968];
+	__le64 checksum;
+};
+
+struct free_entry {
+	__u32 block;
+	__u8 sub;
+	__u8 seq;
+};
+
+struct arena_map {
+	struct btt_sb *info;
+	size_t info_len;
+	void *data;
+	size_t data_len;
+	__u32 *map;
+	size_t map_len;
+	struct log_entry *log;
+	size_t log_len;
+	struct btt_sb *info2;
+	size_t info2_len;
+};
+
+struct arena_info {
+	struct arena_map map;
+	__u64 size;	/* Total bytes for this arena */
+	__u64 external_lba_start;
+	__u32 internal_nlba;
+	__u32 internal_lbasize;
+	__u32 external_nlba;
+	__u32 external_lbasize;
+	__u32 nfree;
+	__u16 version_major;
+	__u16 version_minor;
+	__u64 nextoff;
+	__u64 infooff;
+	__u64 dataoff;
+	__u64 mapoff;
+	__u64 logoff;
+	__u64 info2off;
+	__u32 flags;
+	int num;
+};
+
+struct btt_chk {
+	char *path;
+	uuid_t parent_uuid;
+	unsigned long long rawsize;
+	unsigned long long nlba;
+	int num_arenas;
+	struct arena_info *arena;
+};
+
+#endif
diff --git a/ndctl/builtin-xaction-namespace.c b/ndctl/builtin-xaction-namespace.c
index 2c4f85f..499670d 100644
--- a/ndctl/builtin-xaction-namespace.c
+++ b/ndctl/builtin-xaction-namespace.c
@@ -13,20 +13,25 @@
 #include <stdio.h>
 #include <fcntl.h>
 #include <errno.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <limits.h>
 #include <syslog.h>
+#include <endian.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
 #include <uuid/uuid.h>
 #include <sys/types.h>
 #include <util/size.h>
 #include <util/json.h>
+#include <util/util.h>
 #include <json-c/json.h>
 #include <util/filter.h>
 #include <ndctl/libndctl.h>
 #include <util/parse-options.h>
 #include <ccan/array_size/array_size.h>
+#include "btt-structs.h"
 
 #ifdef HAVE_NDCTL_H
 #include <linux/ndctl.h>
@@ -34,8 +39,11 @@
 #include <ndctl.h>
 #endif
 
+#define SZ_4K 0x1000UL
+
 static bool verbose;
 static bool force;
+static bool dryrun;
 static struct parameters {
 	bool do_scan;
 	bool mode_default;
@@ -106,6 +114,9 @@ OPT_STRING('t', "type", &param.type, "type", \
 	"specify the type of namespace to create 'pmem' or 'blk'"), \
 OPT_BOOLEAN('f', "force", &force, "reconfigure namespace even if currently active")
 
+#define CHECK_OPTIONS() \
+OPT_BOOLEAN('n', "dry-run", &dryrun, "dry-run only, don't write anything")
+
 static const struct option base_options[] = {
 	BASE_OPTIONS(),
 	OPT_END(),
@@ -124,11 +135,18 @@ static const struct option create_options[] = {
 	OPT_END(),
 };
 
+static const struct option check_options[] = {
+	BASE_OPTIONS(),
+	CHECK_OPTIONS(),
+	OPT_END(),
+};
+
 enum namespace_action {
 	ACTION_ENABLE,
 	ACTION_DISABLE,
 	ACTION_CREATE,
 	ACTION_DESTROY,
+	ACTION_CHECK,
 };
 
 static int set_defaults(enum namespace_action mode)
@@ -253,8 +271,23 @@ static const char *parse_namespace_options(int argc, const char **argv,
 	rc = set_defaults(mode);
 
 	if (argc == 0 && mode != ACTION_CREATE) {
-		error("specify a namespace to %s, or \"all\"\n",
-				mode == ACTION_ENABLE ? "enable" : "disable");
+		char *action_string;
+
+		switch (mode) {
+			case ACTION_ENABLE:
+				action_string = "enable";
+				break;
+			case ACTION_DISABLE:
+				action_string = "disable";
+				break;
+			case ACTION_CHECK:
+				action_string = "check";
+				break;
+			default:
+				action_string = "<>";
+				break;
+		}
+		error("specify a namespace to %s, or \"all\"\n", action_string);
 		rc = -EINVAL;
 	}
 	for (i = mode == ACTION_CREATE ? 0 : 1; i < argc; i++) {
@@ -703,6 +736,854 @@ static int namespace_reconfig(struct ndctl_region *region,
 	return setup_namespace(region, ndns, &p);
 }
 
+#ifdef ENABLE_CHECK_NAMESPACE
+
+static int dryrun_msg(void)
+{
+	error("  Run without --dry-run to make the changes");
+	return 0;
+}
+
+static int btt_read_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
+{
+	int fd, rc = 0;
+	ssize_t size;
+
+	fd = open(bttc->path, O_RDONLY);
+	if (fd < 0) {
+		error("unable to open %s: %s\n", bttc->path, strerror(errno));
+		return -errno;
+	}
+
+	size = pread(fd, btt_sb, sizeof(*btt_sb), off + SZ_4K);
+	if (size != sizeof(*btt_sb)) {
+		error("unable to read first info block: %s\n", strerror(errno));
+		rc = -errno;
+	}
+	close(fd);
+	return rc;
+}
+
+static int btt_write_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
+{
+	int fd, rc = 0;
+	ssize_t size;
+
+	if (dryrun) {
+		error("BTT info block at offset %#llx needs to be restored\n", off);
+		dryrun_msg();
+		return -1;
+	}
+	printf("Restoring BTT info block at offset %#llx\n", off);
+
+	fd = open(bttc->path, O_RDWR);
+	if (fd < 0) {
+		error("unable to open %s: %s\n", bttc->path, strerror(errno));
+		return -errno;
+	}
+
+	size = pwrite(fd, btt_sb, sizeof(*btt_sb), off + SZ_4K);
+	if (size != sizeof(*btt_sb)) {
+		error("unable to write the info block: %s\n", strerror(errno));
+		rc = -errno;
+	}
+	close(fd);
+	return rc;
+}
+
+static int btt_copy_to_info2(struct arena_info *a)
+{
+	if (dryrun) {
+		error("Arena %d: BTT info2 needs to be restored\n", a->num);
+		return dryrun_msg();
+	}
+	printf("Arena %d: Restoring BTT info2\n", a->num);
+	memcpy(a->map.info2, a->map.info, BTT_PG_SIZE);
+	return 0;
+}
+static int btt_map_read(struct arena_info *a, __u32 lba, __u32 *mapping,
+			int *trim, int *error)
+{
+	__u32 raw_mapping, postmap, ze, z_flag, e_flag;
+
+	raw_mapping = le32toh(a->map.map[lba]);
+	z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
+	e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
+	ze = (z_flag << 1) + e_flag;
+	postmap = raw_mapping & MAP_LBA_MASK;
+
+	/* Reuse the {z,e}_flag variables for *trim and *error */
+	z_flag = 0;
+	e_flag = 0;
+
+	switch (ze) {
+	case 0:
+		/* Initial state. Return postmap = premap */
+		*mapping = lba;
+		break;
+	case 1:
+		*mapping = postmap;
+		e_flag = 1;
+		break;
+	case 2:
+		*mapping = postmap;
+		z_flag = 1;
+		break;
+	case 3:
+		*mapping = postmap;
+		break;
+	default:
+		return -EIO;
+	}
+
+	if (trim)
+		*trim = z_flag;
+	if (error)
+		*error = e_flag;
+
+	return 0;
+}
+
+static int btt_map_write(struct arena_info *a, __u32 lba, __u32 mapping,
+			__u32 z_flag, __u32 e_flag)
+{
+	__u32 ze;
+
+	/*
+	 * This 'mapping' is supposed to be just the LBA mapping, without
+	 * any flags set, so strip the flag bits.
+	 */
+	mapping &= MAP_LBA_MASK;
+
+	if (dryrun) {
+		error("Arena %d: map[%#x] needs to be updated to %#x\n",
+			a->num, lba, mapping);
+		return dryrun_msg();
+	}
+	printf("Arena %d: Updating map[%#x] to %#x\n", a->num, lba, mapping);
+
+	ze = (z_flag << 1) + e_flag;
+	switch (ze) {
+	case 0:
+		/*
+		 * We want to set neither of the Z or E flags, and
+		 * in the actual layout, this means setting the bit
+		 * positions of both to '1' to indicate a 'normal'
+		 * map entry
+		 */
+		mapping |= MAP_ENT_NORMAL;
+		break;
+	case 1:
+		mapping |= (1 << MAP_ERR_SHIFT);
+		break;
+	case 2:
+		mapping |= (1 << MAP_TRIM_SHIFT);
+		break;
+	default:
+		/*
+		 * The case where Z and E are both sent in as '1' could be
+		 * construed as a valid 'normal' case, but we decide not to,
+		 * to avoid confusion
+		 */
+		error("%s: Invalid use of Z and E flags\n", __func__);
+		return -ENXIO;
+	}
+
+	a->map.map[lba] = htole32(mapping);
+	if (msync((void *)rounddown((__u64)&a->map.map[lba], BTT_PG_SIZE),
+		BTT_PG_SIZE, MS_SYNC) < 0)
+		return errno;
+
+	return 0;
+}
+
+static void btt_log_read_pair(struct arena_info *a, __u32 lane,
+			struct log_entry *ent)
+{
+	memcpy(ent, &a->map.log[lane * 2], 2 * sizeof(struct log_entry));
+}
+
+/*
+ * This function accepts two log entries, and uses the sequence number to
+ * find the 'older' entry. The return value indicates which of the two was
+ * the 'old' entry
+ */
+static int btt_log_get_old(struct log_entry *ent)
+{
+	int old;
+
+	if (ent[0].seq == 0) {
+		ent[0].seq = htole32(1);
+		return 0;
+	}
+
+	if (le32toh(ent[0].seq) < le32toh(ent[1].seq)) {
+		if (le32toh(ent[1].seq) - le32toh(ent[0].seq) == 1)
+			old = 0;
+		else
+			old = 1;
+	} else {
+		if (le32toh(ent[0].seq) - le32toh(ent[1].seq) == 1)
+			old = 1;
+		else
+			old = 0;
+	}
+
+	return old;
+}
+
+static int btt_log_read(struct arena_info *a, __u32 lane, struct log_entry *ent)
+{
+	int new_ent;
+	struct log_entry log[2];
+
+	if (ent == NULL)
+		return -EINVAL;
+	btt_log_read_pair(a, lane, log);
+	new_ent = 1 - btt_log_get_old(log);
+	memcpy(ent, &log[new_ent], sizeof(struct log_entry));
+	return 0;
+}
+
+static uint64_t fletcher64(void *addr, size_t len, bool le)
+{
+	uint32_t *buf = addr;
+	uint32_t lo32 = 0;
+	uint64_t hi32 = 0;
+	unsigned int i;
+
+	for (i = 0; i < len / sizeof(uint32_t); i++) {
+		lo32 += le ? le32toh((__le32) buf[i]) : buf[i];
+		hi32 += lo32;
+	}
+
+	return ((uint64_t)(hi32 << 32)) | lo32;
+}
+
+static int btt_checksum_verify(struct btt_sb *btt_sb)
+{
+	uint64_t sum;
+	__le64 sum_save;
+
+	BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
+
+	sum_save = btt_sb->checksum;
+	btt_sb->checksum = 0;
+	sum = fletcher64(btt_sb, sizeof(*btt_sb), 1);
+	if (sum != sum_save)
+		return 1;
+	/* restore the checksum in the buffer */
+	btt_sb->checksum = sum_save;
+
+	return 0;
+}
+
+/*
+ * Never pass a mmapped buffer to this as it will attempt to write to
+ * the buffer, and we want writes to only happened in a controlled fashion.
+ * In the --dry-run case, even if such a buffer is passed, the write will
+ * result in a fault due to the readonly mmap flags.
+ */
+static int btt_info_verify(struct btt_chk *bttc, struct btt_sb *btt_sb)
+{
+	if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0)
+		return -ENXIO;
+
+	if (!uuid_is_null(btt_sb->parent_uuid))
+		if (uuid_compare(bttc->parent_uuid, btt_sb->parent_uuid) != 0)
+			return -ENXIO;
+
+	if (btt_checksum_verify(btt_sb))
+		return -ENXIO;
+
+	return 0;
+}
+
+static void btt_parse_meta(struct arena_info *arena, struct btt_sb *btt_sb,
+				__u64 arena_off)
+{
+	arena->internal_nlba = le32toh(btt_sb->internal_nlba);
+	arena->internal_lbasize = le32toh(btt_sb->internal_lbasize);
+	arena->external_nlba = le32toh(btt_sb->external_nlba);
+	arena->external_lbasize = le32toh(btt_sb->external_lbasize);
+	arena->nfree = le32toh(btt_sb->nfree);
+	arena->version_major = le16toh(btt_sb->version_major);
+	arena->version_minor = le16toh(btt_sb->version_minor);
+
+	arena->nextoff = (btt_sb->nextoff == 0) ? 0 : (arena_off +
+			le64toh(btt_sb->nextoff));
+	arena->infooff = arena_off;
+	arena->dataoff = arena_off + le64toh(btt_sb->dataoff);
+	arena->mapoff = arena_off + le64toh(btt_sb->mapoff);
+	arena->logoff = arena_off + le64toh(btt_sb->logoff);
+	arena->info2off = arena_off + le64toh(btt_sb->info2off);
+
+	arena->size = (le64toh(btt_sb->nextoff) > 0)
+		? (le64toh(btt_sb->nextoff))
+		: (arena->info2off - arena->infooff + BTT_PG_SIZE);
+
+	arena->flags = le32toh(btt_sb->flags);
+}
+
+static int btt_discover_arenas(struct btt_chk *bttc)
+{
+	int ret = 0;
+	struct arena_info *arena;
+	struct btt_sb *btt_sb;
+	size_t remaining = bttc->rawsize;
+	__u64 cur_nlba = 0;
+	size_t cur_off = 0;
+	int  i = 0;
+
+	btt_sb = calloc(1, sizeof(*btt_sb));
+	if (!btt_sb)
+		return -ENOMEM;
+
+	while (remaining) {
+		/* Alloc memory for arena */
+		arena = realloc(bttc->arena, (i + 1) * sizeof(*arena));
+		if (!arena) {
+			ret = -ENOMEM;
+			goto out;
+		} else {
+			bttc->arena = arena;
+			arena = &bttc->arena[i];
+			/* zero the new memory */
+			memset(arena, 0, sizeof(*arena));
+		}
+
+		arena->infooff = cur_off;
+		ret = btt_read_info(bttc, btt_sb, cur_off);
+		if (ret)
+			goto out;
+
+		if (btt_info_verify(bttc, btt_sb) != 0) {
+			__u64 offset;
+
+			/* Try to find the backup info block */
+			if (remaining <= ARENA_MAX_SIZE)
+				offset = rounddown(bttc->rawsize, BTT_PG_SIZE) -
+					2 * BTT_PG_SIZE;
+			else
+				offset = cur_off + ARENA_MAX_SIZE - BTT_PG_SIZE;
+
+			printf("Arena %d: Attempting recover info-block using info2\n", i);
+			ret = btt_read_info(bttc, btt_sb, offset);
+			if (ret) {
+				error("Unable to read backup info block (offset %lld)\n",
+					offset);
+				goto out;
+			}
+			ret = btt_info_verify(bttc, btt_sb);
+			if (ret) {
+				error("Backup info block (offset %lld) verification failed\n",
+					offset);
+				goto out;
+			}
+			ret = btt_write_info(bttc, btt_sb, cur_off);
+		}
+
+		arena->external_lba_start = cur_nlba;
+		btt_parse_meta(arena, btt_sb, cur_off);
+
+		remaining -= arena->size;
+		cur_off += arena->size;
+		cur_nlba += arena->external_nlba;
+		arena->num = i;
+		i++;
+
+		if (arena->nextoff == 0)
+			break;
+	}
+	bttc->num_arenas = i;
+	bttc->nlba = cur_nlba;
+	printf("found %d BTT arena%s\n", bttc->num_arenas,
+		(bttc->num_arenas > 1) ? "s" : "");
+	free(btt_sb);
+	return ret;
+
+ out:
+	free(bttc->arena);
+	free(btt_sb);
+	return ret;
+}
+
+static int btt_create_mappings(struct btt_chk *bttc)
+{
+	int open_flags, mmap_flags;
+	struct arena_info *a;
+	int fd, rc = 0, i;
+
+	if (dryrun) {
+		open_flags = O_RDONLY;
+		mmap_flags = PROT_READ;
+	} else {
+		open_flags = O_RDWR|O_EXCL;
+		mmap_flags = PROT_READ|PROT_WRITE;
+	}
+
+	fd = open(bttc->path, open_flags);
+	if (fd < 0) {
+		error("unable to open %s: %s\n", bttc->path, strerror(errno));
+		return -errno;
+	}
+
+	for (i = 0; i < bttc->num_arenas; i++) {
+		a = &bttc->arena[i];
+		a->map.info_len = BTT_PG_SIZE;
+		a->map.info = mmap(NULL, a->map.info_len, mmap_flags,
+			MAP_SHARED, fd, a->infooff + SZ_4K);
+		if (a->map.info == MAP_FAILED) {
+			rc = errno;
+			error("mmap arena[%d].info [sz = %#lx, off = %#llx] failed: %d\n",
+					i, a->map.info_len, a->infooff + SZ_4K, rc);
+			goto out;
+		}
+
+		a->map.data_len = a->mapoff - a->dataoff;
+		a->map.data = mmap(NULL, a->map.data_len, mmap_flags,
+			MAP_SHARED, fd, a->dataoff + SZ_4K);
+		if (a->map.data == MAP_FAILED) {
+			rc = errno;
+			error("mmap arena[%d].data [sz = %#lx, off = %#llx] failed: %d\n",
+					i, a->map.data_len, a->dataoff + SZ_4K, rc);
+			goto out;
+		}
+
+		a->map.map_len = a->logoff - a->mapoff;
+		a->map.map = mmap(NULL, a->map.map_len, mmap_flags,
+			MAP_SHARED, fd, a->mapoff + SZ_4K);
+		if (a->map.map == MAP_FAILED) {
+			rc = errno;
+			error("mmap arena[%d].map [sz = %#lx, off = %#llx] failed: %d\n",
+					i, a->map.map_len, a->mapoff + SZ_4K, rc);
+			goto out;
+		}
+
+		a->map.log_len = a->info2off - a->logoff;
+		a->map.log = mmap(NULL, a->map.log_len, mmap_flags,
+			MAP_SHARED, fd, a->logoff + SZ_4K);
+		if (a->map.log == MAP_FAILED) {
+			rc = errno;
+			error("mmap arena[%d].log [sz = %#lx, off = %#llx] failed: %d\n",
+					i, a->map.log_len, a->logoff + SZ_4K, rc);
+			goto out;
+		}
+
+		a->map.info2_len = BTT_PG_SIZE;
+		a->map.info2 = mmap(NULL, a->map.info2_len, mmap_flags,
+			MAP_SHARED, fd, a->info2off + SZ_4K);
+		if (a->map.info2 == MAP_FAILED) {
+			rc = errno;
+			error("mmap arena[%d].info2 [sz = %#lx, off = %#llx] failed: %d\n",
+					i, a->map.info2_len, a->info2off + SZ_4K, rc);
+			goto out;
+		}
+	}
+
+ out:
+	close(fd);
+	return rc;
+}
+
+static void btt_remove_mappings(struct btt_chk *bttc)
+{
+	struct arena_info *a;
+	int i;
+
+	for (i = 0; i < bttc->num_arenas; i++) {
+		a = &bttc->arena[i];
+		if (a->map.info)
+			munmap(a->map.info, a->map.info_len);
+		if (a->map.data)
+			munmap(a->map.data, a->map.data_len);
+		if (a->map.map)
+			munmap(a->map.map, a->map.map_len);
+		if (a->map.log)
+			munmap(a->map.log, a->map.log_len);
+		if (a->map.info2)
+			munmap(a->map.info2, a->map.info2_len);
+	}
+}
+
+enum btt_errcodes {
+	BTT_OK = 0,
+	BTT_LOG_EQL_SEQ = 0x100,
+	BTT_LOG_OOB_SEQ,
+	BTT_LOG_OOB_LBA,
+	BTT_LOG_OOB_OLD,
+	BTT_LOG_OOB_NEW,
+	BTT_LOG_MAP_ERR,
+	BTT_MAP_OOB,
+};
+
+static void btt_xlat_status(struct arena_info *a, int errcode)
+{
+	switch(errcode) {
+	case BTT_OK:
+		break;
+	case BTT_LOG_EQL_SEQ:
+		error("arena %d: found a pair of log entries with the same sequence number\n",
+			a->num);
+		break;
+	case BTT_LOG_OOB_SEQ:
+		error("arena %d: found a log entry with an out of bounds sequence number\n",
+			a->num);
+		break;
+	case BTT_LOG_OOB_LBA:
+		error("arena %d: found a log entry with an out of bounds LBA\n",
+			a->num);
+		break;
+	case BTT_LOG_OOB_OLD:
+		error("arena %d: found a log entry with an out of bounds 'old' mapping\n",
+			a->num);
+		break;
+	case BTT_LOG_OOB_NEW:
+		error("arena %d: found a log entry with an out of bounds 'new' mapping\n",
+			a->num);
+		break;
+	case BTT_LOG_MAP_ERR:
+		error("arena %d: found a log entry that does not match with a map entry\n",
+			a->num);
+		break;
+	case BTT_MAP_OOB:
+		error("arena %d: found a map entry that is out of bounds\n",
+			a->num);
+		break;
+	default:
+		error("arena %d: unknown error: %d\n", a->num, errcode);
+	}
+}
+
+/* Check that log entries are self consistent */
+static int btt_check_log_entries(struct arena_info *a)
+{
+	unsigned int i;
+	int rc = 0;
+
+	/*
+	 * First, check both 'slots' for sequence numbers being distinct
+	 * and in bounds
+	 */
+	for (i = 0; i < (2 * a->nfree); i+=2) {
+		if (a->map.log[i].seq == a->map.log[i + 1].seq)
+			return BTT_LOG_EQL_SEQ;
+		if (a->map.log[i].seq > 3 || a->map.log[i + 1].seq > 3)
+			return BTT_LOG_OOB_SEQ;
+	}
+	/*
+	 * Next, check only the 'new' slot in each lane for the remaining
+	 * entries being in bounds
+	 */
+	for (i = 0; i < a->nfree; i++) {
+		struct log_entry log;
+
+		rc = btt_log_read(a, i, &log);
+		if (rc)
+			return rc;
+
+		if (log.lba >= a->external_nlba)
+			return BTT_LOG_OOB_LBA;
+		if (log.old_map >= a->internal_nlba)
+			return BTT_LOG_OOB_OLD;
+		if (log.new_map >= a->internal_nlba)
+			return BTT_LOG_OOB_NEW;
+	}
+	return rc;
+}
+
+/* Check that log entries are self consistent */
+static int btt_check_map_entries(struct arena_info *a)
+{
+	int rc = 0, z, e;
+	unsigned int i;
+	__u32 mapping;
+
+	for (i = 0; i < a->external_nlba; i++) {
+		rc = btt_map_read(a, i, &mapping, &z, &e);
+		if (rc)
+			return rc;
+		if (mapping >= a->internal_nlba)
+			return BTT_MAP_OOB;
+	}
+	return 0;
+}
+
+/* Check that each flog entry has the correct corresponding map entry */
+static int btt_check_log_map(struct arena_info *a)
+{
+	unsigned int i;
+	__u32 mapping;
+	int rc = 0;
+
+	for (i = 0; i < a->nfree; i++) {
+		struct log_entry log;
+
+		rc = btt_log_read(a, i, &log);
+		if (rc)
+			return rc;
+		rc = btt_map_read(a, log.lba, &mapping, NULL, NULL);
+		if (rc)
+			return rc;
+
+		/*
+		 * Case where the flog was written, but map couldn't be updated.
+		 * The kernel should also be able to detect and fix this condition
+		 */
+		if (log.new_map != mapping && log.old_map == mapping) {
+			error("arena %d: log[%d].new_map (%#x) doesn't match map[%#x] (%#x)",
+				a->num, i, log.new_map, log.lba, mapping);
+			rc = btt_map_write(a, log.lba, log.new_map, 0, 0);
+			if (rc)
+				return BTT_LOG_MAP_ERR;
+		}
+	}
+
+	return rc;
+}
+
+static int btt_check_info2(struct arena_info *a)
+{
+	/*
+	 * Repair info2 if needed. The main info-block can be trusted
+	 * as it has been verified during arena discovery
+	 */
+	if(memcmp(a->map.info2, a->map.info, BTT_PG_SIZE))
+		return btt_copy_to_info2(a);
+	return 0;
+}
+
+static int btt_check_arenas(struct btt_chk *bttc)
+{
+	struct arena_info *a = NULL;
+	int i, rc;
+
+	for(i = 0; i < bttc->num_arenas; i++) {
+		printf("checking arena %d\n", i);
+		a = &bttc->arena[i];
+		rc = btt_check_log_entries(a);
+		if (rc)
+			break;
+		rc = btt_check_map_entries(a);
+		if (rc)
+			break;
+		rc = btt_check_log_map(a);
+		if (rc)
+			break;
+		rc = btt_check_info2(a);
+		if (rc)
+			break;
+	}
+
+	btt_xlat_status(a, rc);
+	return rc;
+}
+
+static bool is_namespace_offline(struct ndctl_namespace *ndns)
+{
+	struct ndctl_dax *dax = ndctl_namespace_get_dax(ndns);
+	struct ndctl_btt *btt = ndctl_namespace_get_btt(ndns);
+	struct ndctl_pfn *pfn = ndctl_namespace_get_pfn(ndns);
+
+	if (ndctl_namespace_is_enabled(ndns))
+		return false;
+
+	if (dax && ndctl_dax_is_enabled(dax))
+		return false;
+
+	if (btt && ndctl_btt_is_enabled(btt))
+		return false;
+
+	if (pfn && ndctl_pfn_is_enabled(pfn))
+		return false;
+
+	return true;
+}
+
+static int btt_recover_first_sb(struct btt_chk *bttc)
+{
+	__u64 offset, remaining = bttc->rawsize;
+	int rc, est_arenas = 0;
+	struct btt_sb *btt_sb;
+
+	/* Estimate the number of arenas */
+	while (remaining) {
+		if (remaining < ARENA_MIN_SIZE && est_arenas == 0)
+			return -EINVAL;
+		if (remaining > ARENA_MAX_SIZE) {
+			remaining -= ARENA_MAX_SIZE;
+			est_arenas++;
+			continue;
+		}
+		if (remaining < ARENA_MIN_SIZE)
+			break;
+		else {
+			remaining = 0;
+			est_arenas++;
+			break;
+		}
+	}
+
+	btt_sb = malloc(2 * sizeof(*btt_sb));
+	if (btt_sb == NULL)
+		return -ENOMEM;
+	/* Read the original first info block into btt_sb[0] */
+	rc = btt_read_info(bttc, &btt_sb[0], 0);
+	if (rc)
+		goto out;
+
+	/* Attepmt 1: try recovery from expected end of the first arena */
+	/*
+	 * Offset calculation has to account for btt_read_info adding the
+	 * extra 4K, so subtract it here
+	 */
+	if (est_arenas == 1)
+		offset = rounddown(bttc->rawsize - remaining, BTT_PG_SIZE) -
+			2 * BTT_PG_SIZE;
+	else
+		offset = ARENA_MAX_SIZE - 2 * BTT_PG_SIZE;
+
+	printf("Attempting recover info-block using info2 at offset %#llx\n", offset);
+	rc = btt_read_info(bttc, &btt_sb[1], offset);
+	if (rc)
+		goto out;
+	rc = btt_info_verify(bttc, &btt_sb[1]);
+	if (rc == 0) {
+		rc = btt_write_info(bttc, &btt_sb[1], 0);
+		goto out;
+	}
+
+	/*
+	 * Attempt 2: From the very end of 'rawsize', try to copy the fields
+	 * that are constant in every arena (only valid when multiple arenas
+	 * are present)
+	 */
+	if (est_arenas > 1) {
+		offset = rounddown(bttc->rawsize - remaining, BTT_PG_SIZE) -
+			2 * BTT_PG_SIZE;
+		printf("Attempting to recover info-block from end offset %#llx\n", offset);
+		rc = btt_read_info(bttc, &btt_sb[1], offset);
+		if (rc)
+			goto out;
+		/* copy over the arena0 specific fields from btt_sb[0] */
+		btt_sb[1].flags = btt_sb[0].flags;
+		btt_sb[1].external_nlba = btt_sb[0].external_nlba;
+		btt_sb[1].internal_nlba = btt_sb[0].internal_nlba;
+		btt_sb[1].nextoff = btt_sb[0].nextoff;
+		btt_sb[1].dataoff = btt_sb[0].dataoff;
+		btt_sb[1].mapoff = btt_sb[0].mapoff;
+		btt_sb[1].logoff = btt_sb[0].logoff;
+		btt_sb[1].info2off = btt_sb[0].info2off;
+		btt_sb[1].checksum = btt_sb[0].checksum;
+		rc = btt_info_verify(bttc, &btt_sb[1]);
+		if (rc == 0) {
+			rc = btt_write_info(bttc, &btt_sb[1], 0);
+			goto out;
+		}
+	}
+
+	/*
+	 * Attempt 3: use info2off as-is, and check if we find a valid info
+	 * block at that location.
+	 */
+	 offset = le32toh(btt_sb[0].info2off);
+	 if (offset) {
+		printf("Attempting to recover info-block from info2 offset (as-is) %#llx\n", offset);
+		rc = btt_read_info(bttc, &btt_sb[1], offset);
+		if (rc)
+			goto out;
+		rc = btt_info_verify(bttc, &btt_sb[1]);
+		if (rc == 0) {
+			rc = btt_write_info(bttc, &btt_sb[1], 0);
+			goto out;
+		}
+	} else
+		rc = -ENXIO;
+ out:
+	free(btt_sb);
+	return rc;
+}
+
+static int namespace_check(struct ndctl_region *region,
+		struct ndctl_namespace *ndns)
+{
+	const char *devname = ndctl_namespace_get_devname(ndns);
+	struct btt_chk *bttc;
+	struct btt_sb *btt_sb;
+	int raw_mode, rc;
+	char path[50];
+
+	printf("checking %s\n", devname);
+	if (!is_namespace_offline(ndns)) {
+		error("%s: check aborted, namespace online\n", devname);
+		return -EBUSY;
+	}
+
+	raw_mode = ndctl_namespace_get_raw_mode(ndns);
+	ndctl_namespace_set_raw_mode(ndns, 1);
+	rc = ndctl_namespace_enable(ndns);
+	if (rc != 0)
+		error("%s: failed to enable raw mode\n", devname);
+	ndctl_namespace_set_raw_mode(ndns, raw_mode);
+	sprintf(path, "/dev/%s", ndctl_namespace_get_block_device(ndns));
+
+	btt_sb = malloc(sizeof(*btt_sb));
+	if (btt_sb == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	bttc = calloc(1, sizeof(*bttc));
+	if (bttc == NULL) {
+		rc = -ENOMEM;
+		goto out_sb;
+	}
+	bttc->path = path;
+	bttc->rawsize = ndctl_namespace_get_size(ndns);
+	ndctl_namespace_get_uuid(ndns, bttc->parent_uuid);
+
+	rc = btt_read_info(bttc, btt_sb, 0);
+	if (rc)
+		goto out_bttc;
+	rc = btt_info_verify(bttc, btt_sb);
+	if (rc) {
+		rc = btt_recover_first_sb(bttc);
+		if (rc) {
+			error("Unable to recover any BTT info blocks, aborting\n");
+			goto out_bttc;
+		}
+		rc = btt_read_info(bttc, btt_sb, 0);
+		if (rc)
+			goto out_bttc;
+	}
+	rc = btt_discover_arenas(bttc);
+	if (rc)
+		goto out_bttc;
+
+	rc = btt_create_mappings(bttc);
+	if (rc)
+		goto out_bttc;
+
+	rc = btt_check_arenas(bttc);
+
+ out_bttc:
+	btt_remove_mappings(bttc);
+	free(bttc);
+ out_sb:
+	free(btt_sb);
+ out:
+	ndctl_namespace_disable(ndns);
+	return rc;
+}
+
+#else
+static int namespace_check(struct ndctl_region *region,
+		struct ndctl_namespace *ndns)
+{
+	return -ENOTTY;
+}
+#endif
+
 static int do_xaction_namespace(const char *namespace,
 		enum namespace_action action, struct ndctl_ctx *ctx)
 {
@@ -765,6 +1646,9 @@ static int do_xaction_namespace(const char *namespace,
 				case ACTION_DESTROY:
 					rc = namespace_destroy(region, ndns);
 					break;
+				case ACTION_CHECK:
+					rc = namespace_check(region, ndns);
+					break;
 				case ACTION_CREATE:
 					rc = namespace_reconfig(region, ndns);
 					if (rc < 0)
@@ -873,3 +1757,25 @@ int cmd_destroy_namespace(int argc , const char **argv, void *ctx)
 		return 0;
 	}
 }
+
+int cmd_check_namespace(int argc , const char **argv, struct ndctl_ctx *ctx)
+{
+	char *xable_usage = "ndctl check-namespace <namespace> [<options>]";
+	const char *namespace = parse_namespace_options(argc, argv,
+			ACTION_CHECK, check_options, xable_usage);
+	int checked;
+
+	checked = do_xaction_namespace(namespace, ACTION_CHECK, ctx);
+	if (checked < 0) {
+		fprintf(stderr, "error checking namespaces: %s\n",
+				strerror(-checked));
+		return checked;
+	} else if (checked == 0) {
+		fprintf(stderr, "checked 0 namespaces\n");
+		return 0;
+	} else {
+		fprintf(stderr, "checked %d namespace%s\n", checked,
+				checked > 1 ? "s" : "");
+		return 0;
+	}
+}
diff --git a/ndctl/ndctl.c b/ndctl/ndctl.c
index 80a0491..ec018c7 100644
--- a/ndctl/ndctl.c
+++ b/ndctl/ndctl.c
@@ -57,6 +57,9 @@ static struct cmd_struct commands[] = {
 	{ "disable-namespace", cmd_disable_namespace },
 	{ "create-namespace", cmd_create_namespace },
 	{ "destroy-namespace", cmd_destroy_namespace },
+	#ifdef ENABLE_CHECK_NAMESPACE
+	{ "check-namespace", cmd_check_namespace },
+	#endif
 	{ "enable-region", cmd_enable_region },
 	{ "disable-region", cmd_disable_region },
 	{ "enable-dimm", cmd_enable_dimm },
diff --git a/util/util.h b/util/util.h
index e0e5f26..d280d10 100644
--- a/util/util.h
+++ b/util/util.h
@@ -23,6 +23,13 @@
 
 #define alloc_nr(x) (((x)+16)*3/2)
 
+#define rounddown(x, y) (				\
+{							\
+	typeof(x) __x = (x);				\
+	__x - (__x % (y));				\
+}							\
+)
+
 /*
  * Realloc the buffer pointed at by variable 'x' so that it can hold
  * at least 'nr' entries; the number of entries currently allocated
@@ -44,6 +51,7 @@
 #define zfree(ptr) ({ free(*ptr); *ptr = NULL; })
 
 #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
 static inline const char *skip_prefix(const char *str, const char *prefix)
 {
-- 
2.9.3

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
  2017-01-20  3:12 [PATCH] ndctl: add a BTT check utility Vishal Verma
@ 2017-01-20 20:04 ` Dan Williams
  2017-01-23 17:42   ` Ross Zwisler
  2017-01-20 21:00 ` Linda Knippers
       [not found] ` <20170120031234.24226-1-vishal.l.verma-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  2 siblings, 1 reply; 11+ messages in thread
From: Dan Williams @ 2017-01-20 20:04 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-nvdimm

[ adding Jeff ]

Some comments on boilerplate / organization items...


On Thu, Jan 19, 2017 at 7:12 PM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> Add the check-namespace command to ndctl. This will check the BTT
> metadata layout for the given namespace, and if requested, correct any
> errors found. Not all metadata corruption is detectable or fixable.
>
> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
> ---
>  Documentation/Makefile.am               |   1 +
>  Documentation/namespace-check.txt       |   7 +
>  Documentation/ndctl-check-namespace.txt |  46 ++
>  Documentation/ndctl.txt                 |   1 +
>  builtin.h                               |   3 +
>  configure.ac                            |  10 +
>  contrib/ndctl                           |   3 +
>  ndctl/btt-structs.h                     | 117 ++++

Let's just call it btt.h

>  ndctl/builtin-xaction-namespace.c       | 910 +++++++++++++++++++++++++++++++-

This doubles the size of builtin-xaction-namespace.c. Let's move the
bulk of this code to it's own file. That way future diffstats can
differentiate create-namespace vs check-namespace changes.

>  ndctl/ndctl.c                           |   3 +
>  util/util.h                             |   8 +
>  11 files changed, 1107 insertions(+), 2 deletions(-)
>  create mode 100644 Documentation/namespace-check.txt
>  create mode 100644 Documentation/ndctl-check-namespace.txt
>  create mode 100644 ndctl/btt-structs.h
>
> diff --git a/Documentation/Makefile.am b/Documentation/Makefile.am
> index adcc9e7..92790b6 100644
> --- a/Documentation/Makefile.am
> +++ b/Documentation/Makefile.am
> @@ -12,6 +12,7 @@ man1_MANS = \
>         ndctl-disable-namespace.1 \
>         ndctl-create-namespace.1 \
>         ndctl-destroy-namespace.1 \
> +       ndctl-check-namespace.1 \
>         ndctl-list.1
>
>  CLEANFILES = $(man1_MANS)
> diff --git a/Documentation/namespace-check.txt b/Documentation/namespace-check.txt
> new file mode 100644
> index 0000000..d304e83
> --- /dev/null
> +++ b/Documentation/namespace-check.txt
> @@ -0,0 +1,7 @@
> +DESCRIPTION
> +-----------
> +
> +A namespace in the 'sector' mode will have metadata on it to describe
> +the kernel BTT (Block Translation Table). The check-namespace command
> +can be used to check the consistency of this metadata, and also attempt
> +to repair it, if it has enough information to do so.
> diff --git a/Documentation/ndctl-check-namespace.txt b/Documentation/ndctl-check-namespace.txt
> new file mode 100644
> index 0000000..18e44fc
> --- /dev/null
> +++ b/Documentation/ndctl-check-namespace.txt
> @@ -0,0 +1,46 @@
> +ndctl-check-namespace(1)
> +=========================
> +
> +NAME
> +----
> +ndctl-check-namespace - check namespace metadata consistency
> +
> +SYNOPSIS
> +--------
> +[verse]
> +'ndctl check-namespace' [<options>]
> +
> +include::namespace-check.txt[]

Since this is the only instance of the include just put the text here directly.

> +
> +EXAMPLES
> +--------
> +
> +Check a BTT namespace
> +[verse]
> +ndctl disable-namespace namespace0.0
> +ndctl check-namespace namespace0.0
> +
> +Check a BTT namespace, but don't actually restore/correct anything
> +[verse]
> +ndctl check-namespace --dry-run
> +
> +OPTIONS
> +-------
> +-n::
> +--dry-run::
> +       Only check the metadata for consistency, don't write anything.

Instead of opting *out* of repair, let's flip the logic and make it
opt-in with a --repair option. That aligns it with the way fsck
handles check vs repair.

> +
> +-v::
> +--verbose::
> +       Emit debug messages for the namespace creation process

s/creation/check/

> +
> +-r::
> +--region=::
> +include::xable-region-options.txt[]
> +
> +SEE ALSO
> +--------
> +linkndctl:ndctl-disable-namespace[1],
> +linkndctl:ndctl-enable-namespace[1],
> +http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf[NVDIMM Namespace
> +Specification]
> diff --git a/Documentation/ndctl.txt b/Documentation/ndctl.txt
> index 883a59c..c26cc2f 100644
> --- a/Documentation/ndctl.txt
> +++ b/Documentation/ndctl.txt
> @@ -34,6 +34,7 @@ SEE ALSO
>  --------
>  linkndctl:ndctl-create-namespace[1],
>  linkndctl:ndctl-destroy-namespace[1],
> +linkndctl:ndctl-check-namespace[1],
>  linkndctl:ndctl-enable-region[1],
>  linkndctl:ndctl-disable-region[1],
>  linkndctl:ndctl-enable-dimm[1],
> diff --git a/builtin.h b/builtin.h
> index 9b66196..b817ced 100644
> --- a/builtin.h
> +++ b/builtin.h
> @@ -13,6 +13,9 @@ int cmd_enable_namespace(int argc, const char **argv, void *ctx);
>  int cmd_create_namespace(int argc, const char **argv, void *ctx);
>  int cmd_destroy_namespace(int argc, const char **argv, void *ctx);
>  int cmd_disable_namespace(int argc, const char **argv, void *ctx);
> +#ifdef ENABLE_CHECK_NAMESPACE
> +int cmd_check_namespace(int argc, const char **argv, struct *ctx);
> +#endif
>  int cmd_enable_region(int argc, const char **argv, void *ctx);
>  int cmd_disable_region(int argc, const char **argv, void *ctx);
>  int cmd_enable_dimm(int argc, const char **argv, void *ctx);
> diff --git a/configure.ac b/configure.ac
> index e79623a..92a0f06 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -111,6 +111,16 @@ fi
>  AC_SUBST([BASH_COMPLETION_DIR])
>  AM_CONDITIONAL([ENABLE_BASH_COMPLETION],[test "x$with_bash_completion_dir" != "xno"])
>
> +AC_ARG_ENABLE([check-namespace],
> +        AS_HELP_STRING([--disable-check-namespace],
> +               [disable the 'check namspace' command @<:@default=system@:>@]),
> +       [], [enable_check_namespace=yes])
> +

I don't see a reason to make this command optional since there are no
external dependencies.

> +AM_CONDITIONAL([ENABLE_CHECK_NAMESPACE], [test "x$enable_check_namespace" = "xyes"])
> +AS_IF([test "x$enable_check_namespace" = "xyes"], [
> +       AC_DEFINE([ENABLE_CHECK_NAMESPACE], [1], [check namespace support])
> +])
> +
>  AC_ARG_ENABLE([local],
>          AS_HELP_STRING([--disable-local], [build against kernel ndctl.h @<:@default=system@:>@]),
>          [], [enable_local=yes])
> diff --git a/contrib/ndctl b/contrib/ndctl
> index ea7303c..c97adcc 100755
> --- a/contrib/ndctl
> +++ b/contrib/ndctl
> @@ -194,6 +194,9 @@ __ndctl_comp_non_option_args()
>         destroy-namespace)
>                 opts="$(__ndctl_get_ns) all"
>                 ;;
> +       check-namespace)
> +               opts="$(__ndctl_get_ns -i) all"
> +               ;;
>         enable-region)
>                 opts="$(__ndctl_get_regions -i) all"
>                 ;;
> diff --git a/ndctl/btt-structs.h b/ndctl/btt-structs.h
> new file mode 100644
> index 0000000..1329668
> --- /dev/null
> +++ b/ndctl/btt-structs.h
> @@ -0,0 +1,117 @@
> +/*
> + * Copyright (c) 2016, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _BTT_STRUCTS_H
> +#define _BTT_STRUCTS_H
> +
> +#include <linux/types.h>

Please switch this file over to using ccan/endian/endian.h like the
other endian aware code in ndctl. It happens to use the same type and
symbol names as the kernel which for me is easier to read and it
enables sparse to be able to check endian handling.

> +
> +#define BTT_SIG_LEN 16
> +#define BTT_SIG "BTT_ARENA_INFO\0"
> +#define MAP_ENT_SIZE 4
> +#define MAP_TRIM_SHIFT 31
> +#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT)
> +#define MAP_ERR_SHIFT 30
> +#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
> +#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
> +#define MAP_ENT_NORMAL 0xC0000000
> +#define LOG_ENT_SIZE sizeof(struct log_entry)
> +#define ARENA_MIN_SIZE (1UL << 24)     /* 16 MB */
> +#define ARENA_MAX_SIZE (1ULL << 39)    /* 512 GB */
> +#define BTT_PG_SIZE 4096
> +#define BTT_DEFAULT_NFREE 256
> +#define LOG_SEQ_INIT 1
> +
> +#define IB_FLAG_ERROR 0x00000001
> +#define IB_FLAG_ERROR_MASK 0x00000001
> +
> +struct log_entry {
> +       __le32 lba;
> +       __le32 old_map;
> +       __le32 new_map;
> +       __le32 seq;
> +       __le64 padding[2];
> +};
> +
> +struct btt_sb {
> +       __u8 signature[BTT_SIG_LEN];
> +       __u8 uuid[16];
> +       __u8 parent_uuid[16];
> +       __le32 flags;
> +       __le16 version_major;
> +       __le16 version_minor;
> +       __le32 external_lbasize;
> +       __le32 external_nlba;
> +       __le32 internal_lbasize;
> +       __le32 internal_nlba;
> +       __le32 nfree;
> +       __le32 infosize;
> +       __le64 nextoff;
> +       __le64 dataoff;
> +       __le64 mapoff;
> +       __le64 logoff;
> +       __le64 info2off;
> +       __u8 padding[3968];
> +       __le64 checksum;
> +};
> +
> +struct free_entry {
> +       __u32 block;
> +       __u8 sub;
> +       __u8 seq;
> +};
> +
> +struct arena_map {
> +       struct btt_sb *info;
> +       size_t info_len;
> +       void *data;
> +       size_t data_len;
> +       __u32 *map;
> +       size_t map_len;
> +       struct log_entry *log;
> +       size_t log_len;
> +       struct btt_sb *info2;
> +       size_t info2_len;
> +};
> +
> +struct arena_info {
> +       struct arena_map map;
> +       __u64 size;     /* Total bytes for this arena */
> +       __u64 external_lba_start;
> +       __u32 internal_nlba;
> +       __u32 internal_lbasize;
> +       __u32 external_nlba;
> +       __u32 external_lbasize;
> +       __u32 nfree;
> +       __u16 version_major;
> +       __u16 version_minor;
> +       __u64 nextoff;
> +       __u64 infooff;
> +       __u64 dataoff;
> +       __u64 mapoff;
> +       __u64 logoff;
> +       __u64 info2off;
> +       __u32 flags;
> +       int num;
> +};
> +
> +struct btt_chk {
> +       char *path;
> +       uuid_t parent_uuid;
> +       unsigned long long rawsize;
> +       unsigned long long nlba;
> +       int num_arenas;
> +       struct arena_info *arena;
> +};
> +
> +#endif
> diff --git a/ndctl/builtin-xaction-namespace.c b/ndctl/builtin-xaction-namespace.c
> index 2c4f85f..499670d 100644
> --- a/ndctl/builtin-xaction-namespace.c
> +++ b/ndctl/builtin-xaction-namespace.c
> @@ -13,20 +13,25 @@
>  #include <stdio.h>
>  #include <fcntl.h>
>  #include <errno.h>
> +#include <stdint.h>
>  #include <stdlib.h>
>  #include <unistd.h>
>  #include <limits.h>
>  #include <syslog.h>
> +#include <endian.h>
> +#include <sys/mman.h>
>  #include <sys/stat.h>
>  #include <uuid/uuid.h>
>  #include <sys/types.h>
>  #include <util/size.h>
>  #include <util/json.h>
> +#include <util/util.h>
>  #include <json-c/json.h>
>  #include <util/filter.h>
>  #include <ndctl/libndctl.h>
>  #include <util/parse-options.h>
>  #include <ccan/array_size/array_size.h>
> +#include "btt-structs.h"
>
>  #ifdef HAVE_NDCTL_H
>  #include <linux/ndctl.h>
> @@ -34,8 +39,11 @@
>  #include <ndctl.h>
>  #endif
>
> +#define SZ_4K 0x1000UL

If you rebase to the latest pending branch this define is now included
in util/sizes.h.

> +
>  static bool verbose;
>  static bool force;
> +static bool dryrun;
>  static struct parameters {
>         bool do_scan;
>         bool mode_default;
> @@ -106,6 +114,9 @@ OPT_STRING('t', "type", &param.type, "type", \
>         "specify the type of namespace to create 'pmem' or 'blk'"), \
>  OPT_BOOLEAN('f', "force", &force, "reconfigure namespace even if currently active")
>
> +#define CHECK_OPTIONS() \
> +OPT_BOOLEAN('n', "dry-run", &dryrun, "dry-run only, don't write anything")
> +
>  static const struct option base_options[] = {
>         BASE_OPTIONS(),
>         OPT_END(),
> @@ -124,11 +135,18 @@ static const struct option create_options[] = {
>         OPT_END(),
>  };
>
> +static const struct option check_options[] = {
> +       BASE_OPTIONS(),
> +       CHECK_OPTIONS(),
> +       OPT_END(),
> +};
> +
>  enum namespace_action {
>         ACTION_ENABLE,
>         ACTION_DISABLE,
>         ACTION_CREATE,
>         ACTION_DESTROY,
> +       ACTION_CHECK,
>  };
>
>  static int set_defaults(enum namespace_action mode)
> @@ -253,8 +271,23 @@ static const char *parse_namespace_options(int argc, const char **argv,
>         rc = set_defaults(mode);
>
>         if (argc == 0 && mode != ACTION_CREATE) {
> -               error("specify a namespace to %s, or \"all\"\n",
> -                               mode == ACTION_ENABLE ? "enable" : "disable");
> +               char *action_string;
> +
> +               switch (mode) {
> +                       case ACTION_ENABLE:
> +                               action_string = "enable";
> +                               break;
> +                       case ACTION_DISABLE:
> +                               action_string = "disable";
> +                               break;
> +                       case ACTION_CHECK:
> +                               action_string = "check";
> +                               break;

This seems to be missing ACTION_DESTROY

> +                       default:
> +                               action_string = "<>";
> +                               break;
> +               }
> +               error("specify a namespace to %s, or \"all\"\n", action_string);
>                 rc = -EINVAL;
>         }
>         for (i = mode == ACTION_CREATE ? 0 : 1; i < argc; i++) {
> @@ -703,6 +736,854 @@ static int namespace_reconfig(struct ndctl_region *region,
>         return setup_namespace(region, ndns, &p);
>  }
>
> +#ifdef ENABLE_CHECK_NAMESPACE
> +
> +static int dryrun_msg(void)
> +{
> +       error("  Run without --dry-run to make the changes");

Even though we're getting rid --dry-run this is an informational
usage() message, not an error()

> +       return 0;
> +}
> +
> +static int btt_read_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
> +{
> +       int fd, rc = 0;
> +       ssize_t size;
> +
> +       fd = open(bttc->path, O_RDONLY);

Let's not open and close the btt device on each call. Open it once at
the beginning of time and open it O_EXCL so that we prevent the kernel
trying to claim the device. Let's also use O_DIRECT to bypass the
block_device page cache.

> +       if (fd < 0) {
> +               error("unable to open %s: %s\n", bttc->path, strerror(errno));
> +               return -errno;
> +       }
> +
> +       size = pread(fd, btt_sb, sizeof(*btt_sb), off + SZ_4K);
> +       if (size != sizeof(*btt_sb)) {
> +               error("unable to read first info block: %s\n", strerror(errno));
> +               rc = -errno;
> +       }
> +       close(fd);
> +       return rc;
> +}
> +
> +static int btt_write_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
> +{
> +       int fd, rc = 0;
> +       ssize_t size;
> +
> +       if (dryrun) {
> +               error("BTT info block at offset %#llx needs to be restored\n", off);
> +               dryrun_msg();
> +               return -1;
> +       }
> +       printf("Restoring BTT info block at offset %#llx\n", off);
> +
> +       fd = open(bttc->path, O_RDWR);
> +       if (fd < 0) {
> +               error("unable to open %s: %s\n", bttc->path, strerror(errno));
> +               return -errno;
> +       }
> +
> +       size = pwrite(fd, btt_sb, sizeof(*btt_sb), off + SZ_4K);
> +       if (size != sizeof(*btt_sb)) {
> +               error("unable to write the info block: %s\n", strerror(errno));
> +               rc = -errno;
> +       }
> +       close(fd);
> +       return rc;
> +}
> +
> +static int btt_copy_to_info2(struct arena_info *a)
> +{
> +       if (dryrun) {
> +               error("Arena %d: BTT info2 needs to be restored\n", a->num);
> +               return dryrun_msg();
> +       }
> +       printf("Arena %d: Restoring BTT info2\n", a->num);
> +       memcpy(a->map.info2, a->map.info, BTT_PG_SIZE);
> +       return 0;
> +}
> +static int btt_map_read(struct arena_info *a, __u32 lba, __u32 *mapping,
> +                       int *trim, int *error)
> +{
> +       __u32 raw_mapping, postmap, ze, z_flag, e_flag;
> +
> +       raw_mapping = le32toh(a->map.map[lba]);

Switch to le32_to_cpu()...

> +       z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
> +       e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
> +       ze = (z_flag << 1) + e_flag;
> +       postmap = raw_mapping & MAP_LBA_MASK;
> +
> +       /* Reuse the {z,e}_flag variables for *trim and *error */
> +       z_flag = 0;
> +       e_flag = 0;
> +
> +       switch (ze) {
> +       case 0:
> +               /* Initial state. Return postmap = premap */
> +               *mapping = lba;
> +               break;
> +       case 1:
> +               *mapping = postmap;
> +               e_flag = 1;
> +               break;
> +       case 2:
> +               *mapping = postmap;
> +               z_flag = 1;
> +               break;
> +       case 3:
> +               *mapping = postmap;
> +               break;
> +       default:
> +               return -EIO;
> +       }
> +
> +       if (trim)
> +               *trim = z_flag;
> +       if (error)
> +               *error = e_flag;
> +
> +       return 0;
> +}
> +
> +static int btt_map_write(struct arena_info *a, __u32 lba, __u32 mapping,
> +                       __u32 z_flag, __u32 e_flag)
> +{
> +       __u32 ze;
> +
> +       /*
> +        * This 'mapping' is supposed to be just the LBA mapping, without
> +        * any flags set, so strip the flag bits.
> +        */
> +       mapping &= MAP_LBA_MASK;
> +
> +       if (dryrun) {
> +               error("Arena %d: map[%#x] needs to be updated to %#x\n",
> +                       a->num, lba, mapping);
> +               return dryrun_msg();
> +       }
> +       printf("Arena %d: Updating map[%#x] to %#x\n", a->num, lba, mapping);
> +
> +       ze = (z_flag << 1) + e_flag;
> +       switch (ze) {
> +       case 0:
> +               /*
> +                * We want to set neither of the Z or E flags, and
> +                * in the actual layout, this means setting the bit
> +                * positions of both to '1' to indicate a 'normal'
> +                * map entry
> +                */
> +               mapping |= MAP_ENT_NORMAL;
> +               break;
> +       case 1:
> +               mapping |= (1 << MAP_ERR_SHIFT);
> +               break;
> +       case 2:
> +               mapping |= (1 << MAP_TRIM_SHIFT);
> +               break;
> +       default:
> +               /*
> +                * The case where Z and E are both sent in as '1' could be
> +                * construed as a valid 'normal' case, but we decide not to,
> +                * to avoid confusion
> +                */
> +               error("%s: Invalid use of Z and E flags\n", __func__);
> +               return -ENXIO;
> +       }
> +
> +       a->map.map[lba] = htole32(mapping);
> +       if (msync((void *)rounddown((__u64)&a->map.map[lba], BTT_PG_SIZE),
> +               BTT_PG_SIZE, MS_SYNC) < 0)
> +               return errno;
> +
> +       return 0;
> +}
> +
> +static void btt_log_read_pair(struct arena_info *a, __u32 lane,
> +                       struct log_entry *ent)
> +{
> +       memcpy(ent, &a->map.log[lane * 2], 2 * sizeof(struct log_entry));
> +}
> +
> +/*
> + * This function accepts two log entries, and uses the sequence number to
> + * find the 'older' entry. The return value indicates which of the two was
> + * the 'old' entry
> + */
> +static int btt_log_get_old(struct log_entry *ent)
> +{
> +       int old;
> +
> +       if (ent[0].seq == 0) {
> +               ent[0].seq = htole32(1);
> +               return 0;
> +       }
> +
> +       if (le32toh(ent[0].seq) < le32toh(ent[1].seq)) {
> +               if (le32toh(ent[1].seq) - le32toh(ent[0].seq) == 1)
> +                       old = 0;
> +               else
> +                       old = 1;
> +       } else {
> +               if (le32toh(ent[0].seq) - le32toh(ent[1].seq) == 1)
> +                       old = 1;
> +               else
> +                       old = 0;
> +       }
> +
> +       return old;
> +}
> +
> +static int btt_log_read(struct arena_info *a, __u32 lane, struct log_entry *ent)
> +{
> +       int new_ent;
> +       struct log_entry log[2];
> +
> +       if (ent == NULL)
> +               return -EINVAL;
> +       btt_log_read_pair(a, lane, log);
> +       new_ent = 1 - btt_log_get_old(log);
> +       memcpy(ent, &log[new_ent], sizeof(struct log_entry));
> +       return 0;
> +}
> +
> +static uint64_t fletcher64(void *addr, size_t len, bool le)
> +{
> +       uint32_t *buf = addr;
> +       uint32_t lo32 = 0;
> +       uint64_t hi32 = 0;
> +       unsigned int i;
> +
> +       for (i = 0; i < len / sizeof(uint32_t); i++) {
> +               lo32 += le ? le32toh((__le32) buf[i]) : buf[i];
> +               hi32 += lo32;
> +       }
> +
> +       return ((uint64_t)(hi32 << 32)) | lo32;
> +}
> +

We already have this in ndctl/builtin-dimm.c. Move it to util/ so it
can be shared.

> +static int btt_checksum_verify(struct btt_sb *btt_sb)
> +{
> +       uint64_t sum;
> +       __le64 sum_save;
> +
> +       BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
> +
> +       sum_save = btt_sb->checksum;
> +       btt_sb->checksum = 0;
> +       sum = fletcher64(btt_sb, sizeof(*btt_sb), 1);
> +       if (sum != sum_save)
> +               return 1;
> +       /* restore the checksum in the buffer */
> +       btt_sb->checksum = sum_save;
> +
> +       return 0;
> +}
> +
> +/*
> + * Never pass a mmapped buffer to this as it will attempt to write to
> + * the buffer, and we want writes to only happened in a controlled fashion.
> + * In the --dry-run case, even if such a buffer is passed, the write will
> + * result in a fault due to the readonly mmap flags.
> + */
> +static int btt_info_verify(struct btt_chk *bttc, struct btt_sb *btt_sb)
> +{
> +       if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0)
> +               return -ENXIO;
> +
> +       if (!uuid_is_null(btt_sb->parent_uuid))
> +               if (uuid_compare(bttc->parent_uuid, btt_sb->parent_uuid) != 0)
> +                       return -ENXIO;
> +
> +       if (btt_checksum_verify(btt_sb))
> +               return -ENXIO;
> +
> +       return 0;
> +}
> +
> +static void btt_parse_meta(struct arena_info *arena, struct btt_sb *btt_sb,
> +                               __u64 arena_off)
> +{
> +       arena->internal_nlba = le32toh(btt_sb->internal_nlba);
> +       arena->internal_lbasize = le32toh(btt_sb->internal_lbasize);
> +       arena->external_nlba = le32toh(btt_sb->external_nlba);
> +       arena->external_lbasize = le32toh(btt_sb->external_lbasize);
> +       arena->nfree = le32toh(btt_sb->nfree);
> +       arena->version_major = le16toh(btt_sb->version_major);
> +       arena->version_minor = le16toh(btt_sb->version_minor);
> +
> +       arena->nextoff = (btt_sb->nextoff == 0) ? 0 : (arena_off +
> +                       le64toh(btt_sb->nextoff));
> +       arena->infooff = arena_off;
> +       arena->dataoff = arena_off + le64toh(btt_sb->dataoff);
> +       arena->mapoff = arena_off + le64toh(btt_sb->mapoff);
> +       arena->logoff = arena_off + le64toh(btt_sb->logoff);
> +       arena->info2off = arena_off + le64toh(btt_sb->info2off);
> +
> +       arena->size = (le64toh(btt_sb->nextoff) > 0)
> +               ? (le64toh(btt_sb->nextoff))
> +               : (arena->info2off - arena->infooff + BTT_PG_SIZE);
> +
> +       arena->flags = le32toh(btt_sb->flags);
> +}
> +
> +static int btt_discover_arenas(struct btt_chk *bttc)
> +{
> +       int ret = 0;
> +       struct arena_info *arena;
> +       struct btt_sb *btt_sb;
> +       size_t remaining = bttc->rawsize;
> +       __u64 cur_nlba = 0;
> +       size_t cur_off = 0;
> +       int  i = 0;
> +
> +       btt_sb = calloc(1, sizeof(*btt_sb));
> +       if (!btt_sb)
> +               return -ENOMEM;
> +
> +       while (remaining) {
> +               /* Alloc memory for arena */
> +               arena = realloc(bttc->arena, (i + 1) * sizeof(*arena));
> +               if (!arena) {
> +                       ret = -ENOMEM;
> +                       goto out;
> +               } else {
> +                       bttc->arena = arena;
> +                       arena = &bttc->arena[i];
> +                       /* zero the new memory */
> +                       memset(arena, 0, sizeof(*arena));
> +               }
> +
> +               arena->infooff = cur_off;
> +               ret = btt_read_info(bttc, btt_sb, cur_off);
> +               if (ret)
> +                       goto out;
> +
> +               if (btt_info_verify(bttc, btt_sb) != 0) {
> +                       __u64 offset;
> +
> +                       /* Try to find the backup info block */
> +                       if (remaining <= ARENA_MAX_SIZE)
> +                               offset = rounddown(bttc->rawsize, BTT_PG_SIZE) -
> +                                       2 * BTT_PG_SIZE;
> +                       else
> +                               offset = cur_off + ARENA_MAX_SIZE - BTT_PG_SIZE;
> +
> +                       printf("Arena %d: Attempting recover info-block using info2\n", i);
> +                       ret = btt_read_info(bttc, btt_sb, offset);
> +                       if (ret) {
> +                               error("Unable to read backup info block (offset %lld)\n",
> +                                       offset);
> +                               goto out;
> +                       }
> +                       ret = btt_info_verify(bttc, btt_sb);
> +                       if (ret) {
> +                               error("Backup info block (offset %lld) verification failed\n",
> +                                       offset);
> +                               goto out;
> +                       }
> +                       ret = btt_write_info(bttc, btt_sb, cur_off);
> +               }
> +
> +               arena->external_lba_start = cur_nlba;
> +               btt_parse_meta(arena, btt_sb, cur_off);
> +
> +               remaining -= arena->size;
> +               cur_off += arena->size;
> +               cur_nlba += arena->external_nlba;
> +               arena->num = i;
> +               i++;
> +
> +               if (arena->nextoff == 0)
> +                       break;
> +       }
> +       bttc->num_arenas = i;
> +       bttc->nlba = cur_nlba;
> +       printf("found %d BTT arena%s\n", bttc->num_arenas,
> +               (bttc->num_arenas > 1) ? "s" : "");
> +       free(btt_sb);
> +       return ret;
> +
> + out:
> +       free(bttc->arena);
> +       free(btt_sb);
> +       return ret;
> +}
> +
> +static int btt_create_mappings(struct btt_chk *bttc)
> +{
> +       int open_flags, mmap_flags;
> +       struct arena_info *a;
> +       int fd, rc = 0, i;
> +
> +       if (dryrun) {
> +               open_flags = O_RDONLY;
> +               mmap_flags = PROT_READ;
> +       } else {
> +               open_flags = O_RDWR|O_EXCL;
> +               mmap_flags = PROT_READ|PROT_WRITE;

Ah, here's O_EXCL, but we should arrange for bttc have an open fd
already by this point.

> +       }
> +
> +       fd = open(bttc->path, open_flags);
> +       if (fd < 0) {
> +               error("unable to open %s: %s\n", bttc->path, strerror(errno));
> +               return -errno;
> +       }
> +
> +       for (i = 0; i < bttc->num_arenas; i++) {
> +               a = &bttc->arena[i];
> +               a->map.info_len = BTT_PG_SIZE;
> +               a->map.info = mmap(NULL, a->map.info_len, mmap_flags,
> +                       MAP_SHARED, fd, a->infooff + SZ_4K);
> +               if (a->map.info == MAP_FAILED) {
> +                       rc = errno;
> +                       error("mmap arena[%d].info [sz = %#lx, off = %#llx] failed: %d\n",
> +                                       i, a->map.info_len, a->infooff + SZ_4K, rc);
> +                       goto out;
> +               }
> +
> +               a->map.data_len = a->mapoff - a->dataoff;
> +               a->map.data = mmap(NULL, a->map.data_len, mmap_flags,
> +                       MAP_SHARED, fd, a->dataoff + SZ_4K);
> +               if (a->map.data == MAP_FAILED) {
> +                       rc = errno;
> +                       error("mmap arena[%d].data [sz = %#lx, off = %#llx] failed: %d\n",
> +                                       i, a->map.data_len, a->dataoff + SZ_4K, rc);
> +                       goto out;
> +               }
> +
> +               a->map.map_len = a->logoff - a->mapoff;
> +               a->map.map = mmap(NULL, a->map.map_len, mmap_flags,
> +                       MAP_SHARED, fd, a->mapoff + SZ_4K);
> +               if (a->map.map == MAP_FAILED) {
> +                       rc = errno;
> +                       error("mmap arena[%d].map [sz = %#lx, off = %#llx] failed: %d\n",
> +                                       i, a->map.map_len, a->mapoff + SZ_4K, rc);
> +                       goto out;
> +               }
> +
> +               a->map.log_len = a->info2off - a->logoff;
> +               a->map.log = mmap(NULL, a->map.log_len, mmap_flags,
> +                       MAP_SHARED, fd, a->logoff + SZ_4K);
> +               if (a->map.log == MAP_FAILED) {
> +                       rc = errno;
> +                       error("mmap arena[%d].log [sz = %#lx, off = %#llx] failed: %d\n",
> +                                       i, a->map.log_len, a->logoff + SZ_4K, rc);
> +                       goto out;
> +               }
> +
> +               a->map.info2_len = BTT_PG_SIZE;
> +               a->map.info2 = mmap(NULL, a->map.info2_len, mmap_flags,
> +                       MAP_SHARED, fd, a->info2off + SZ_4K);
> +               if (a->map.info2 == MAP_FAILED) {
> +                       rc = errno;
> +                       error("mmap arena[%d].info2 [sz = %#lx, off = %#llx] failed: %d\n",
> +                                       i, a->map.info2_len, a->info2off + SZ_4K, rc);
> +                       goto out;
> +               }
> +       }
> +
> + out:
> +       close(fd);
> +       return rc;
> +}
> +
> +static void btt_remove_mappings(struct btt_chk *bttc)
> +{
> +       struct arena_info *a;
> +       int i;
> +
> +       for (i = 0; i < bttc->num_arenas; i++) {
> +               a = &bttc->arena[i];
> +               if (a->map.info)
> +                       munmap(a->map.info, a->map.info_len);
> +               if (a->map.data)
> +                       munmap(a->map.data, a->map.data_len);
> +               if (a->map.map)
> +                       munmap(a->map.map, a->map.map_len);
> +               if (a->map.log)
> +                       munmap(a->map.log, a->map.log_len);
> +               if (a->map.info2)
> +                       munmap(a->map.info2, a->map.info2_len);
> +       }
> +}
> +
> +enum btt_errcodes {
> +       BTT_OK = 0,
> +       BTT_LOG_EQL_SEQ = 0x100,
> +       BTT_LOG_OOB_SEQ,
> +       BTT_LOG_OOB_LBA,
> +       BTT_LOG_OOB_OLD,
> +       BTT_LOG_OOB_NEW,
> +       BTT_LOG_MAP_ERR,
> +       BTT_MAP_OOB,
> +};
> +
> +static void btt_xlat_status(struct arena_info *a, int errcode)
> +{
> +       switch(errcode) {
> +       case BTT_OK:
> +               break;
> +       case BTT_LOG_EQL_SEQ:
> +               error("arena %d: found a pair of log entries with the same sequence number\n",
> +                       a->num);
> +               break;
> +       case BTT_LOG_OOB_SEQ:
> +               error("arena %d: found a log entry with an out of bounds sequence number\n",
> +                       a->num);
> +               break;
> +       case BTT_LOG_OOB_LBA:
> +               error("arena %d: found a log entry with an out of bounds LBA\n",
> +                       a->num);
> +               break;
> +       case BTT_LOG_OOB_OLD:
> +               error("arena %d: found a log entry with an out of bounds 'old' mapping\n",
> +                       a->num);
> +               break;
> +       case BTT_LOG_OOB_NEW:
> +               error("arena %d: found a log entry with an out of bounds 'new' mapping\n",
> +                       a->num);
> +               break;
> +       case BTT_LOG_MAP_ERR:
> +               error("arena %d: found a log entry that does not match with a map entry\n",
> +                       a->num);
> +               break;
> +       case BTT_MAP_OOB:
> +               error("arena %d: found a map entry that is out of bounds\n",
> +                       a->num);
> +               break;
> +       default:
> +               error("arena %d: unknown error: %d\n", a->num, errcode);
> +       }

Are these all errors? Seems we need an info() method. You might switch
to using util/log.c as we might want to suppress this level of print
detail by default. Only if the user specifies verbose mode should we
print out each error I think.

> +}
> +
> +/* Check that log entries are self consistent */
> +static int btt_check_log_entries(struct arena_info *a)
> +{
> +       unsigned int i;
> +       int rc = 0;
> +
> +       /*
> +        * First, check both 'slots' for sequence numbers being distinct
> +        * and in bounds
> +        */
> +       for (i = 0; i < (2 * a->nfree); i+=2) {
> +               if (a->map.log[i].seq == a->map.log[i + 1].seq)
> +                       return BTT_LOG_EQL_SEQ;
> +               if (a->map.log[i].seq > 3 || a->map.log[i + 1].seq > 3)
> +                       return BTT_LOG_OOB_SEQ;
> +       }
> +       /*
> +        * Next, check only the 'new' slot in each lane for the remaining
> +        * entries being in bounds
> +        */
> +       for (i = 0; i < a->nfree; i++) {
> +               struct log_entry log;
> +
> +               rc = btt_log_read(a, i, &log);
> +               if (rc)
> +                       return rc;
> +
> +               if (log.lba >= a->external_nlba)
> +                       return BTT_LOG_OOB_LBA;
> +               if (log.old_map >= a->internal_nlba)
> +                       return BTT_LOG_OOB_OLD;
> +               if (log.new_map >= a->internal_nlba)
> +                       return BTT_LOG_OOB_NEW;
> +       }
> +       return rc;
> +}
> +
> +/* Check that log entries are self consistent */
> +static int btt_check_map_entries(struct arena_info *a)
> +{
> +       int rc = 0, z, e;
> +       unsigned int i;
> +       __u32 mapping;
> +
> +       for (i = 0; i < a->external_nlba; i++) {
> +               rc = btt_map_read(a, i, &mapping, &z, &e);
> +               if (rc)
> +                       return rc;
> +               if (mapping >= a->internal_nlba)
> +                       return BTT_MAP_OOB;
> +       }
> +       return 0;
> +}
> +
> +/* Check that each flog entry has the correct corresponding map entry */
> +static int btt_check_log_map(struct arena_info *a)
> +{
> +       unsigned int i;
> +       __u32 mapping;
> +       int rc = 0;
> +
> +       for (i = 0; i < a->nfree; i++) {
> +               struct log_entry log;
> +
> +               rc = btt_log_read(a, i, &log);
> +               if (rc)
> +                       return rc;
> +               rc = btt_map_read(a, log.lba, &mapping, NULL, NULL);
> +               if (rc)
> +                       return rc;
> +
> +               /*
> +                * Case where the flog was written, but map couldn't be updated.
> +                * The kernel should also be able to detect and fix this condition
> +                */
> +               if (log.new_map != mapping && log.old_map == mapping) {
> +                       error("arena %d: log[%d].new_map (%#x) doesn't match map[%#x] (%#x)",
> +                               a->num, i, log.new_map, log.lba, mapping);
> +                       rc = btt_map_write(a, log.lba, log.new_map, 0, 0);
> +                       if (rc)
> +                               return BTT_LOG_MAP_ERR;
> +               }
> +       }
> +
> +       return rc;
> +}
> +
> +static int btt_check_info2(struct arena_info *a)
> +{
> +       /*
> +        * Repair info2 if needed. The main info-block can be trusted
> +        * as it has been verified during arena discovery
> +        */
> +       if(memcmp(a->map.info2, a->map.info, BTT_PG_SIZE))
> +               return btt_copy_to_info2(a);
> +       return 0;
> +}
> +
> +static int btt_check_arenas(struct btt_chk *bttc)
> +{
> +       struct arena_info *a = NULL;
> +       int i, rc;
> +
> +       for(i = 0; i < bttc->num_arenas; i++) {
> +               printf("checking arena %d\n", i);
> +               a = &bttc->arena[i];
> +               rc = btt_check_log_entries(a);
> +               if (rc)
> +                       break;
> +               rc = btt_check_map_entries(a);
> +               if (rc)
> +                       break;
> +               rc = btt_check_log_map(a);
> +               if (rc)
> +                       break;
> +               rc = btt_check_info2(a);
> +               if (rc)
> +                       break;
> +       }
> +
> +       btt_xlat_status(a, rc);
> +       return rc;
> +}
> +
> +static bool is_namespace_offline(struct ndctl_namespace *ndns)
> +{
> +       struct ndctl_dax *dax = ndctl_namespace_get_dax(ndns);
> +       struct ndctl_btt *btt = ndctl_namespace_get_btt(ndns);
> +       struct ndctl_pfn *pfn = ndctl_namespace_get_pfn(ndns);
> +
> +       if (ndctl_namespace_is_enabled(ndns))
> +               return false;
> +
> +       if (dax && ndctl_dax_is_enabled(dax))
> +               return false;
> +
> +       if (btt && ndctl_btt_is_enabled(btt))
> +               return false;
> +
> +       if (pfn && ndctl_pfn_is_enabled(pfn))
> +               return false;
> +
> +       return true;
> +}

Maybe we should make this a libndctl api, I think I've seen this pattern before.
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
  2017-01-20  3:12 [PATCH] ndctl: add a BTT check utility Vishal Verma
  2017-01-20 20:04 ` Dan Williams
@ 2017-01-20 21:00 ` Linda Knippers
  2017-01-24 22:58   ` Verma, Vishal L
       [not found] ` <20170120031234.24226-1-vishal.l.verma-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  2 siblings, 1 reply; 11+ messages in thread
From: Linda Knippers @ 2017-01-20 21:00 UTC (permalink / raw)
  To: Vishal Verma, linux-nvdimm

Hi Vishal,

It's nice to see a BTT check utility.

I have a few high level questions/comments below.

-- ljk

On 01/19/2017 10:12 PM, Vishal Verma wrote:
> Add the check-namespace command to ndctl. This will check the BTT
> metadata layout for the given namespace, and if requested, correct any
> errors found. Not all metadata corruption is detectable or fixable.
> 
> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
> ---
>  Documentation/Makefile.am               |   1 +
>  Documentation/namespace-check.txt       |   7 +
>  Documentation/ndctl-check-namespace.txt |  46 ++
>  Documentation/ndctl.txt                 |   1 +
>  builtin.h                               |   3 +
>  configure.ac                            |  10 +
>  contrib/ndctl                           |   3 +
>  ndctl/btt-structs.h                     | 117 ++++
>  ndctl/builtin-xaction-namespace.c       | 910 +++++++++++++++++++++++++++++++-
>  ndctl/ndctl.c                           |   3 +
>  util/util.h                             |   8 +
>  11 files changed, 1107 insertions(+), 2 deletions(-)
>  create mode 100644 Documentation/namespace-check.txt
>  create mode 100644 Documentation/ndctl-check-namespace.txt
>  create mode 100644 ndctl/btt-structs.h
> 
> diff --git a/Documentation/Makefile.am b/Documentation/Makefile.am
> index adcc9e7..92790b6 100644
> --- a/Documentation/Makefile.am
> +++ b/Documentation/Makefile.am
> @@ -12,6 +12,7 @@ man1_MANS = \
>  	ndctl-disable-namespace.1 \
>  	ndctl-create-namespace.1 \
>  	ndctl-destroy-namespace.1 \
> +	ndctl-check-namespace.1 \
>  	ndctl-list.1
>  
>  CLEANFILES = $(man1_MANS)
> diff --git a/Documentation/namespace-check.txt b/Documentation/namespace-check.txt
> new file mode 100644
> index 0000000..d304e83
> --- /dev/null
> +++ b/Documentation/namespace-check.txt
> @@ -0,0 +1,7 @@
> +DESCRIPTION
> +-----------
> +
> +A namespace in the 'sector' mode will have metadata on it to describe
> +the kernel BTT (Block Translation Table). The check-namespace command
> +can be used to check the consistency of this metadata, and also attempt
> +to repair it, if it has enough information to do so.
> diff --git a/Documentation/ndctl-check-namespace.txt b/Documentation/ndctl-check-namespace.txt
> new file mode 100644
> index 0000000..18e44fc
> --- /dev/null
> +++ b/Documentation/ndctl-check-namespace.txt
> @@ -0,0 +1,46 @@
> +ndctl-check-namespace(1)
> +=========================
> +
> +NAME
> +----
> +ndctl-check-namespace - check namespace metadata consistency
> +
> +SYNOPSIS
> +--------
> +[verse]
> +'ndctl check-namespace' [<options>]
> +
> +include::namespace-check.txt[]
> +
> +EXAMPLES
> +--------
> +
> +Check a BTT namespace
> +[verse]
> +ndctl disable-namespace namespace0.0
> +ndctl check-namespace namespace0.0

If the namespace has to be disabled to be checked, it's probably worth saying in
the text as well as in the example.

> +
> +Check a BTT namespace, but don't actually restore/correct anything
> +[verse]
> +ndctl check-namespace --dry-run

Does the namespace not have to be disabled for the --dry-run option?


> +
> +OPTIONS
> +-------
> +-n::
> +--dry-run::
> +	Only check the metadata for consistency, don't write anything.
> +
> +-v::
> +--verbose::
> +	Emit debug messages for the namespace creation process
> +
> +-r::
> +--region=::
> +include::xable-region-options.txt[]
> +
> +SEE ALSO
> +--------
> +linkndctl:ndctl-disable-namespace[1],
> +linkndctl:ndctl-enable-namespace[1],
> +http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf[NVDIMM Namespace
> +Specification]
> diff --git a/Documentation/ndctl.txt b/Documentation/ndctl.txt
> index 883a59c..c26cc2f 100644
> --- a/Documentation/ndctl.txt
> +++ b/Documentation/ndctl.txt
> @@ -34,6 +34,7 @@ SEE ALSO
>  --------
>  linkndctl:ndctl-create-namespace[1],
>  linkndctl:ndctl-destroy-namespace[1],
> +linkndctl:ndctl-check-namespace[1],
>  linkndctl:ndctl-enable-region[1],
>  linkndctl:ndctl-disable-region[1],
>  linkndctl:ndctl-enable-dimm[1],
> diff --git a/builtin.h b/builtin.h
> index 9b66196..b817ced 100644
> --- a/builtin.h
> +++ b/builtin.h
> @@ -13,6 +13,9 @@ int cmd_enable_namespace(int argc, const char **argv, void *ctx);
>  int cmd_create_namespace(int argc, const char **argv, void *ctx);
>  int cmd_destroy_namespace(int argc, const char **argv, void *ctx);
>  int cmd_disable_namespace(int argc, const char **argv, void *ctx);
> +#ifdef ENABLE_CHECK_NAMESPACE
> +int cmd_check_namespace(int argc, const char **argv, struct *ctx);
> +#endif
>  int cmd_enable_region(int argc, const char **argv, void *ctx);
>  int cmd_disable_region(int argc, const char **argv, void *ctx);
>  int cmd_enable_dimm(int argc, const char **argv, void *ctx);
> diff --git a/configure.ac b/configure.ac
> index e79623a..92a0f06 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -111,6 +111,16 @@ fi
>  AC_SUBST([BASH_COMPLETION_DIR])
>  AM_CONDITIONAL([ENABLE_BASH_COMPLETION],[test "x$with_bash_completion_dir" != "xno"])
>  
> +AC_ARG_ENABLE([check-namespace],
> +        AS_HELP_STRING([--disable-check-namespace],
> +		[disable the 'check namspace' command @<:@default=system@:>@]),
> +	[], [enable_check_namespace=yes])
> +
> +AM_CONDITIONAL([ENABLE_CHECK_NAMESPACE], [test "x$enable_check_namespace" = "xyes"])
> +AS_IF([test "x$enable_check_namespace" = "xyes"], [
> +	AC_DEFINE([ENABLE_CHECK_NAMESPACE], [1], [check namespace support])
> +])
> +
>  AC_ARG_ENABLE([local],
>          AS_HELP_STRING([--disable-local], [build against kernel ndctl.h @<:@default=system@:>@]),
>          [], [enable_local=yes])
> diff --git a/contrib/ndctl b/contrib/ndctl
> index ea7303c..c97adcc 100755
> --- a/contrib/ndctl
> +++ b/contrib/ndctl
> @@ -194,6 +194,9 @@ __ndctl_comp_non_option_args()
>  	destroy-namespace)
>  		opts="$(__ndctl_get_ns) all"
>  		;;
> +	check-namespace)
> +		opts="$(__ndctl_get_ns -i) all"
> +		;;
>  	enable-region)
>  		opts="$(__ndctl_get_regions -i) all"
>  		;;
> diff --git a/ndctl/btt-structs.h b/ndctl/btt-structs.h
> new file mode 100644
> index 0000000..1329668
> --- /dev/null
> +++ b/ndctl/btt-structs.h
> @@ -0,0 +1,117 @@
> +/*
> + * Copyright (c) 2016, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _BTT_STRUCTS_H
> +#define _BTT_STRUCTS_H
> +
> +#include <linux/types.h>
> +
> +#define BTT_SIG_LEN 16
> +#define BTT_SIG "BTT_ARENA_INFO\0"
> +#define MAP_ENT_SIZE 4
> +#define MAP_TRIM_SHIFT 31
> +#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT)
> +#define MAP_ERR_SHIFT 30
> +#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
> +#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
> +#define MAP_ENT_NORMAL 0xC0000000
> +#define LOG_ENT_SIZE sizeof(struct log_entry)
> +#define ARENA_MIN_SIZE (1UL << 24)	/* 16 MB */
> +#define ARENA_MAX_SIZE (1ULL << 39)	/* 512 GB */
> +#define BTT_PG_SIZE 4096
> +#define BTT_DEFAULT_NFREE 256
> +#define LOG_SEQ_INIT 1
> +
> +#define IB_FLAG_ERROR 0x00000001
> +#define IB_FLAG_ERROR_MASK 0x00000001
> +
> +struct log_entry {
> +	__le32 lba;
> +	__le32 old_map;
> +	__le32 new_map;
> +	__le32 seq;
> +	__le64 padding[2];
> +};
> +
> +struct btt_sb {
> +	__u8 signature[BTT_SIG_LEN];
> +	__u8 uuid[16];
> +	__u8 parent_uuid[16];
> +	__le32 flags;
> +	__le16 version_major;
> +	__le16 version_minor;
> +	__le32 external_lbasize;
> +	__le32 external_nlba;
> +	__le32 internal_lbasize;
> +	__le32 internal_nlba;
> +	__le32 nfree;
> +	__le32 infosize;
> +	__le64 nextoff;
> +	__le64 dataoff;
> +	__le64 mapoff;
> +	__le64 logoff;
> +	__le64 info2off;
> +	__u8 padding[3968];
> +	__le64 checksum;
> +};
> +
> +struct free_entry {
> +	__u32 block;
> +	__u8 sub;
> +	__u8 seq;
> +};
> +
> +struct arena_map {
> +	struct btt_sb *info;
> +	size_t info_len;
> +	void *data;
> +	size_t data_len;
> +	__u32 *map;
> +	size_t map_len;
> +	struct log_entry *log;
> +	size_t log_len;
> +	struct btt_sb *info2;
> +	size_t info2_len;
> +};
> +
> +struct arena_info {
> +	struct arena_map map;
> +	__u64 size;	/* Total bytes for this arena */
> +	__u64 external_lba_start;
> +	__u32 internal_nlba;
> +	__u32 internal_lbasize;
> +	__u32 external_nlba;
> +	__u32 external_lbasize;
> +	__u32 nfree;
> +	__u16 version_major;
> +	__u16 version_minor;
> +	__u64 nextoff;
> +	__u64 infooff;
> +	__u64 dataoff;
> +	__u64 mapoff;
> +	__u64 logoff;
> +	__u64 info2off;
> +	__u32 flags;
> +	int num;
> +};
> +
> +struct btt_chk {
> +	char *path;
> +	uuid_t parent_uuid;
> +	unsigned long long rawsize;
> +	unsigned long long nlba;
> +	int num_arenas;
> +	struct arena_info *arena;
> +};
> +
> +#endif
> diff --git a/ndctl/builtin-xaction-namespace.c b/ndctl/builtin-xaction-namespace.c
> index 2c4f85f..499670d 100644
> --- a/ndctl/builtin-xaction-namespace.c
> +++ b/ndctl/builtin-xaction-namespace.c
> @@ -13,20 +13,25 @@
>  #include <stdio.h>
>  #include <fcntl.h>
>  #include <errno.h>
> +#include <stdint.h>
>  #include <stdlib.h>
>  #include <unistd.h>
>  #include <limits.h>
>  #include <syslog.h>
> +#include <endian.h>
> +#include <sys/mman.h>
>  #include <sys/stat.h>
>  #include <uuid/uuid.h>
>  #include <sys/types.h>
>  #include <util/size.h>
>  #include <util/json.h>
> +#include <util/util.h>
>  #include <json-c/json.h>
>  #include <util/filter.h>
>  #include <ndctl/libndctl.h>
>  #include <util/parse-options.h>
>  #include <ccan/array_size/array_size.h>
> +#include "btt-structs.h"
>  
>  #ifdef HAVE_NDCTL_H
>  #include <linux/ndctl.h>
> @@ -34,8 +39,11 @@
>  #include <ndctl.h>
>  #endif
>  
> +#define SZ_4K 0x1000UL
> +
>  static bool verbose;
>  static bool force;
> +static bool dryrun;
>  static struct parameters {
>  	bool do_scan;
>  	bool mode_default;
> @@ -106,6 +114,9 @@ OPT_STRING('t', "type", &param.type, "type", \
>  	"specify the type of namespace to create 'pmem' or 'blk'"), \
>  OPT_BOOLEAN('f', "force", &force, "reconfigure namespace even if currently active")
>  
> +#define CHECK_OPTIONS() \
> +OPT_BOOLEAN('n', "dry-run", &dryrun, "dry-run only, don't write anything")
> +
>  static const struct option base_options[] = {
>  	BASE_OPTIONS(),
>  	OPT_END(),
> @@ -124,11 +135,18 @@ static const struct option create_options[] = {
>  	OPT_END(),
>  };
>  
> +static const struct option check_options[] = {
> +	BASE_OPTIONS(),
> +	CHECK_OPTIONS(),
> +	OPT_END(),
> +};
> +
>  enum namespace_action {
>  	ACTION_ENABLE,
>  	ACTION_DISABLE,
>  	ACTION_CREATE,
>  	ACTION_DESTROY,
> +	ACTION_CHECK,
>  };
>  
>  static int set_defaults(enum namespace_action mode)
> @@ -253,8 +271,23 @@ static const char *parse_namespace_options(int argc, const char **argv,
>  	rc = set_defaults(mode);
>  
>  	if (argc == 0 && mode != ACTION_CREATE) {
> -		error("specify a namespace to %s, or \"all\"\n",
> -				mode == ACTION_ENABLE ? "enable" : "disable");
> +		char *action_string;
> +
> +		switch (mode) {
> +			case ACTION_ENABLE:
> +				action_string = "enable";
> +				break;
> +			case ACTION_DISABLE:
> +				action_string = "disable";
> +				break;
> +			case ACTION_CHECK:
> +				action_string = "check";
> +				break;
> +			default:
> +				action_string = "<>";
> +				break;
> +		}
> +		error("specify a namespace to %s, or \"all\"\n", action_string);
>  		rc = -EINVAL;
>  	}
>  	for (i = mode == ACTION_CREATE ? 0 : 1; i < argc; i++) {
> @@ -703,6 +736,854 @@ static int namespace_reconfig(struct ndctl_region *region,
>  	return setup_namespace(region, ndns, &p);
>  }
>  
> +#ifdef ENABLE_CHECK_NAMESPACE
> +
> +static int dryrun_msg(void)
> +{
> +	error("  Run without --dry-run to make the changes");
> +	return 0;
> +}
> +
> +static int btt_read_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
> +{
> +	int fd, rc = 0;
> +	ssize_t size;
> +
> +	fd = open(bttc->path, O_RDONLY);
> +	if (fd < 0) {
> +		error("unable to open %s: %s\n", bttc->path, strerror(errno));
> +		return -errno;
> +	}
> +
> +	size = pread(fd, btt_sb, sizeof(*btt_sb), off + SZ_4K);
> +	if (size != sizeof(*btt_sb)) {
> +		error("unable to read first info block: %s\n", strerror(errno));
> +		rc = -errno;
> +	}
> +	close(fd);
> +	return rc;
> +}
> +
> +static int btt_write_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
> +{
> +	int fd, rc = 0;
> +	ssize_t size;
> +
> +	if (dryrun) {
> +		error("BTT info block at offset %#llx needs to be restored\n", off);
> +		dryrun_msg();
> +		return -1;
> +	}
> +	printf("Restoring BTT info block at offset %#llx\n", off);
> +
> +	fd = open(bttc->path, O_RDWR);
> +	if (fd < 0) {
> +		error("unable to open %s: %s\n", bttc->path, strerror(errno));
> +		return -errno;
> +	}
> +
> +	size = pwrite(fd, btt_sb, sizeof(*btt_sb), off + SZ_4K);
> +	if (size != sizeof(*btt_sb)) {
> +		error("unable to write the info block: %s\n", strerror(errno));
> +		rc = -errno;
> +	}
> +	close(fd);
> +	return rc;
> +}
> +
> +static int btt_copy_to_info2(struct arena_info *a)
> +{
> +	if (dryrun) {
> +		error("Arena %d: BTT info2 needs to be restored\n", a->num);
> +		return dryrun_msg();
> +	}
> +	printf("Arena %d: Restoring BTT info2\n", a->num);
> +	memcpy(a->map.info2, a->map.info, BTT_PG_SIZE);
> +	return 0;
> +}
> +static int btt_map_read(struct arena_info *a, __u32 lba, __u32 *mapping,
> +			int *trim, int *error)
> +{
> +	__u32 raw_mapping, postmap, ze, z_flag, e_flag;
> +
> +	raw_mapping = le32toh(a->map.map[lba]);
> +	z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
> +	e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
> +	ze = (z_flag << 1) + e_flag;
> +	postmap = raw_mapping & MAP_LBA_MASK;
> +
> +	/* Reuse the {z,e}_flag variables for *trim and *error */
> +	z_flag = 0;
> +	e_flag = 0;
> +
> +	switch (ze) {
> +	case 0:
> +		/* Initial state. Return postmap = premap */
> +		*mapping = lba;
> +		break;
> +	case 1:
> +		*mapping = postmap;
> +		e_flag = 1;
> +		break;
> +	case 2:
> +		*mapping = postmap;
> +		z_flag = 1;
> +		break;
> +	case 3:
> +		*mapping = postmap;
> +		break;
> +	default:
> +		return -EIO;
> +	}
> +
> +	if (trim)
> +		*trim = z_flag;
> +	if (error)
> +		*error = e_flag;
> +
> +	return 0;
> +}
> +
> +static int btt_map_write(struct arena_info *a, __u32 lba, __u32 mapping,
> +			__u32 z_flag, __u32 e_flag)
> +{
> +	__u32 ze;
> +
> +	/*
> +	 * This 'mapping' is supposed to be just the LBA mapping, without
> +	 * any flags set, so strip the flag bits.
> +	 */
> +	mapping &= MAP_LBA_MASK;
> +
> +	if (dryrun) {
> +		error("Arena %d: map[%#x] needs to be updated to %#x\n",
> +			a->num, lba, mapping);
> +		return dryrun_msg();
> +	}
> +	printf("Arena %d: Updating map[%#x] to %#x\n", a->num, lba, mapping);
> +
> +	ze = (z_flag << 1) + e_flag;
> +	switch (ze) {
> +	case 0:
> +		/*
> +		 * We want to set neither of the Z or E flags, and
> +		 * in the actual layout, this means setting the bit
> +		 * positions of both to '1' to indicate a 'normal'
> +		 * map entry
> +		 */
> +		mapping |= MAP_ENT_NORMAL;
> +		break;
> +	case 1:
> +		mapping |= (1 << MAP_ERR_SHIFT);
> +		break;
> +	case 2:
> +		mapping |= (1 << MAP_TRIM_SHIFT);
> +		break;
> +	default:
> +		/*
> +		 * The case where Z and E are both sent in as '1' could be
> +		 * construed as a valid 'normal' case, but we decide not to,
> +		 * to avoid confusion
> +		 */
> +		error("%s: Invalid use of Z and E flags\n", __func__);
> +		return -ENXIO;
> +	}
> +
> +	a->map.map[lba] = htole32(mapping);
> +	if (msync((void *)rounddown((__u64)&a->map.map[lba], BTT_PG_SIZE),
> +		BTT_PG_SIZE, MS_SYNC) < 0)
> +		return errno;
> +
> +	return 0;
> +}
> +
> +static void btt_log_read_pair(struct arena_info *a, __u32 lane,
> +			struct log_entry *ent)
> +{
> +	memcpy(ent, &a->map.log[lane * 2], 2 * sizeof(struct log_entry));
> +}
> +
> +/*
> + * This function accepts two log entries, and uses the sequence number to
> + * find the 'older' entry. The return value indicates which of the two was
> + * the 'old' entry
> + */
> +static int btt_log_get_old(struct log_entry *ent)
> +{
> +	int old;
> +
> +	if (ent[0].seq == 0) {
> +		ent[0].seq = htole32(1);
> +		return 0;
> +	}
> +
> +	if (le32toh(ent[0].seq) < le32toh(ent[1].seq)) {
> +		if (le32toh(ent[1].seq) - le32toh(ent[0].seq) == 1)
> +			old = 0;
> +		else
> +			old = 1;
> +	} else {
> +		if (le32toh(ent[0].seq) - le32toh(ent[1].seq) == 1)
> +			old = 1;
> +		else
> +			old = 0;
> +	}
> +
> +	return old;
> +}
> +
> +static int btt_log_read(struct arena_info *a, __u32 lane, struct log_entry *ent)
> +{
> +	int new_ent;
> +	struct log_entry log[2];
> +
> +	if (ent == NULL)
> +		return -EINVAL;
> +	btt_log_read_pair(a, lane, log);
> +	new_ent = 1 - btt_log_get_old(log);
> +	memcpy(ent, &log[new_ent], sizeof(struct log_entry));
> +	return 0;
> +}
> +
> +static uint64_t fletcher64(void *addr, size_t len, bool le)
> +{
> +	uint32_t *buf = addr;
> +	uint32_t lo32 = 0;
> +	uint64_t hi32 = 0;
> +	unsigned int i;
> +
> +	for (i = 0; i < len / sizeof(uint32_t); i++) {
> +		lo32 += le ? le32toh((__le32) buf[i]) : buf[i];
> +		hi32 += lo32;
> +	}
> +
> +	return ((uint64_t)(hi32 << 32)) | lo32;
> +}
> +
> +static int btt_checksum_verify(struct btt_sb *btt_sb)
> +{
> +	uint64_t sum;
> +	__le64 sum_save;
> +
> +	BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
> +
> +	sum_save = btt_sb->checksum;
> +	btt_sb->checksum = 0;
> +	sum = fletcher64(btt_sb, sizeof(*btt_sb), 1);
> +	if (sum != sum_save)
> +		return 1;
> +	/* restore the checksum in the buffer */
> +	btt_sb->checksum = sum_save;
> +
> +	return 0;
> +}
> +
> +/*
> + * Never pass a mmapped buffer to this as it will attempt to write to
> + * the buffer, and we want writes to only happened in a controlled fashion.
> + * In the --dry-run case, even if such a buffer is passed, the write will
> + * result in a fault due to the readonly mmap flags.
> + */
> +static int btt_info_verify(struct btt_chk *bttc, struct btt_sb *btt_sb)
> +{
> +	if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0)
> +		return -ENXIO;
> +
> +	if (!uuid_is_null(btt_sb->parent_uuid))
> +		if (uuid_compare(bttc->parent_uuid, btt_sb->parent_uuid) != 0)
> +			return -ENXIO;
> +
> +	if (btt_checksum_verify(btt_sb))
> +		return -ENXIO;
> +
> +	return 0;
> +}
> +
> +static void btt_parse_meta(struct arena_info *arena, struct btt_sb *btt_sb,
> +				__u64 arena_off)
> +{
> +	arena->internal_nlba = le32toh(btt_sb->internal_nlba);
> +	arena->internal_lbasize = le32toh(btt_sb->internal_lbasize);
> +	arena->external_nlba = le32toh(btt_sb->external_nlba);
> +	arena->external_lbasize = le32toh(btt_sb->external_lbasize);
> +	arena->nfree = le32toh(btt_sb->nfree);
> +	arena->version_major = le16toh(btt_sb->version_major);
> +	arena->version_minor = le16toh(btt_sb->version_minor);
> +
> +	arena->nextoff = (btt_sb->nextoff == 0) ? 0 : (arena_off +
> +			le64toh(btt_sb->nextoff));
> +	arena->infooff = arena_off;
> +	arena->dataoff = arena_off + le64toh(btt_sb->dataoff);
> +	arena->mapoff = arena_off + le64toh(btt_sb->mapoff);
> +	arena->logoff = arena_off + le64toh(btt_sb->logoff);
> +	arena->info2off = arena_off + le64toh(btt_sb->info2off);
> +
> +	arena->size = (le64toh(btt_sb->nextoff) > 0)
> +		? (le64toh(btt_sb->nextoff))
> +		: (arena->info2off - arena->infooff + BTT_PG_SIZE);
> +
> +	arena->flags = le32toh(btt_sb->flags);
> +}
> +
> +static int btt_discover_arenas(struct btt_chk *bttc)
> +{
> +	int ret = 0;
> +	struct arena_info *arena;
> +	struct btt_sb *btt_sb;
> +	size_t remaining = bttc->rawsize;
> +	__u64 cur_nlba = 0;
> +	size_t cur_off = 0;
> +	int  i = 0;
> +
> +	btt_sb = calloc(1, sizeof(*btt_sb));
> +	if (!btt_sb)
> +		return -ENOMEM;
> +
> +	while (remaining) {
> +		/* Alloc memory for arena */
> +		arena = realloc(bttc->arena, (i + 1) * sizeof(*arena));
> +		if (!arena) {
> +			ret = -ENOMEM;
> +			goto out;
> +		} else {
> +			bttc->arena = arena;
> +			arena = &bttc->arena[i];
> +			/* zero the new memory */
> +			memset(arena, 0, sizeof(*arena));
> +		}
> +
> +		arena->infooff = cur_off;
> +		ret = btt_read_info(bttc, btt_sb, cur_off);
> +		if (ret)
> +			goto out;
> +
> +		if (btt_info_verify(bttc, btt_sb) != 0) {
> +			__u64 offset;
> +
> +			/* Try to find the backup info block */
> +			if (remaining <= ARENA_MAX_SIZE)
> +				offset = rounddown(bttc->rawsize, BTT_PG_SIZE) -
> +					2 * BTT_PG_SIZE;
> +			else
> +				offset = cur_off + ARENA_MAX_SIZE - BTT_PG_SIZE;
> +
> +			printf("Arena %d: Attempting recover info-block using info2\n", i);
> +			ret = btt_read_info(bttc, btt_sb, offset);
> +			if (ret) {
> +				error("Unable to read backup info block (offset %lld)\n",
> +					offset);
> +				goto out;
> +			}
> +			ret = btt_info_verify(bttc, btt_sb);
> +			if (ret) {
> +				error("Backup info block (offset %lld) verification failed\n",
> +					offset);
> +				goto out;
> +			}
> +			ret = btt_write_info(bttc, btt_sb, cur_off);
> +		}
> +
> +		arena->external_lba_start = cur_nlba;
> +		btt_parse_meta(arena, btt_sb, cur_off);
> +
> +		remaining -= arena->size;
> +		cur_off += arena->size;
> +		cur_nlba += arena->external_nlba;
> +		arena->num = i;
> +		i++;
> +
> +		if (arena->nextoff == 0)
> +			break;
> +	}
> +	bttc->num_arenas = i;
> +	bttc->nlba = cur_nlba;
> +	printf("found %d BTT arena%s\n", bttc->num_arenas,
> +		(bttc->num_arenas > 1) ? "s" : "");
> +	free(btt_sb);
> +	return ret;
> +
> + out:
> +	free(bttc->arena);
> +	free(btt_sb);
> +	return ret;
> +}
> +
> +static int btt_create_mappings(struct btt_chk *bttc)
> +{
> +	int open_flags, mmap_flags;
> +	struct arena_info *a;
> +	int fd, rc = 0, i;
> +
> +	if (dryrun) {
> +		open_flags = O_RDONLY;
> +		mmap_flags = PROT_READ;
> +	} else {
> +		open_flags = O_RDWR|O_EXCL;
> +		mmap_flags = PROT_READ|PROT_WRITE;
> +	}
> +
> +	fd = open(bttc->path, open_flags);
> +	if (fd < 0) {
> +		error("unable to open %s: %s\n", bttc->path, strerror(errno));
> +		return -errno;
> +	}
> +
> +	for (i = 0; i < bttc->num_arenas; i++) {
> +		a = &bttc->arena[i];
> +		a->map.info_len = BTT_PG_SIZE;
> +		a->map.info = mmap(NULL, a->map.info_len, mmap_flags,
> +			MAP_SHARED, fd, a->infooff + SZ_4K);
> +		if (a->map.info == MAP_FAILED) {
> +			rc = errno;
> +			error("mmap arena[%d].info [sz = %#lx, off = %#llx] failed: %d\n",
> +					i, a->map.info_len, a->infooff + SZ_4K, rc);
> +			goto out;
> +		}
> +
> +		a->map.data_len = a->mapoff - a->dataoff;
> +		a->map.data = mmap(NULL, a->map.data_len, mmap_flags,
> +			MAP_SHARED, fd, a->dataoff + SZ_4K);
> +		if (a->map.data == MAP_FAILED) {
> +			rc = errno;
> +			error("mmap arena[%d].data [sz = %#lx, off = %#llx] failed: %d\n",
> +					i, a->map.data_len, a->dataoff + SZ_4K, rc);
> +			goto out;
> +		}
> +
> +		a->map.map_len = a->logoff - a->mapoff;
> +		a->map.map = mmap(NULL, a->map.map_len, mmap_flags,
> +			MAP_SHARED, fd, a->mapoff + SZ_4K);
> +		if (a->map.map == MAP_FAILED) {
> +			rc = errno;
> +			error("mmap arena[%d].map [sz = %#lx, off = %#llx] failed: %d\n",
> +					i, a->map.map_len, a->mapoff + SZ_4K, rc);
> +			goto out;
> +		}
> +
> +		a->map.log_len = a->info2off - a->logoff;
> +		a->map.log = mmap(NULL, a->map.log_len, mmap_flags,
> +			MAP_SHARED, fd, a->logoff + SZ_4K);
> +		if (a->map.log == MAP_FAILED) {
> +			rc = errno;
> +			error("mmap arena[%d].log [sz = %#lx, off = %#llx] failed: %d\n",
> +					i, a->map.log_len, a->logoff + SZ_4K, rc);
> +			goto out;
> +		}
> +
> +		a->map.info2_len = BTT_PG_SIZE;
> +		a->map.info2 = mmap(NULL, a->map.info2_len, mmap_flags,
> +			MAP_SHARED, fd, a->info2off + SZ_4K);
> +		if (a->map.info2 == MAP_FAILED) {
> +			rc = errno;
> +			error("mmap arena[%d].info2 [sz = %#lx, off = %#llx] failed: %d\n",
> +					i, a->map.info2_len, a->info2off + SZ_4K, rc);
> +			goto out;
> +		}
> +	}
> +
> + out:
> +	close(fd);
> +	return rc;
> +}
> +
> +static void btt_remove_mappings(struct btt_chk *bttc)
> +{
> +	struct arena_info *a;
> +	int i;
> +
> +	for (i = 0; i < bttc->num_arenas; i++) {
> +		a = &bttc->arena[i];
> +		if (a->map.info)
> +			munmap(a->map.info, a->map.info_len);
> +		if (a->map.data)
> +			munmap(a->map.data, a->map.data_len);
> +		if (a->map.map)
> +			munmap(a->map.map, a->map.map_len);
> +		if (a->map.log)
> +			munmap(a->map.log, a->map.log_len);
> +		if (a->map.info2)
> +			munmap(a->map.info2, a->map.info2_len);
> +	}
> +}
> +
> +enum btt_errcodes {
> +	BTT_OK = 0,
> +	BTT_LOG_EQL_SEQ = 0x100,
> +	BTT_LOG_OOB_SEQ,
> +	BTT_LOG_OOB_LBA,
> +	BTT_LOG_OOB_OLD,
> +	BTT_LOG_OOB_NEW,
> +	BTT_LOG_MAP_ERR,
> +	BTT_MAP_OOB,
> +};
> +
> +static void btt_xlat_status(struct arena_info *a, int errcode)
> +{
> +	switch(errcode) {
> +	case BTT_OK:
> +		break;
> +	case BTT_LOG_EQL_SEQ:
> +		error("arena %d: found a pair of log entries with the same sequence number\n",
> +			a->num);
> +		break;
> +	case BTT_LOG_OOB_SEQ:
> +		error("arena %d: found a log entry with an out of bounds sequence number\n",
> +			a->num);
> +		break;
> +	case BTT_LOG_OOB_LBA:
> +		error("arena %d: found a log entry with an out of bounds LBA\n",
> +			a->num);
> +		break;
> +	case BTT_LOG_OOB_OLD:
> +		error("arena %d: found a log entry with an out of bounds 'old' mapping\n",
> +			a->num);
> +		break;
> +	case BTT_LOG_OOB_NEW:
> +		error("arena %d: found a log entry with an out of bounds 'new' mapping\n",
> +			a->num);
> +		break;
> +	case BTT_LOG_MAP_ERR:
> +		error("arena %d: found a log entry that does not match with a map entry\n",
> +			a->num);
> +		break;
> +	case BTT_MAP_OOB:
> +		error("arena %d: found a map entry that is out of bounds\n",
> +			a->num);
> +		break;
> +	default:
> +		error("arena %d: unknown error: %d\n", a->num, errcode);
> +	}
> +}
> +
> +/* Check that log entries are self consistent */
> +static int btt_check_log_entries(struct arena_info *a)
> +{
> +	unsigned int i;
> +	int rc = 0;
> +
> +	/*
> +	 * First, check both 'slots' for sequence numbers being distinct
> +	 * and in bounds
> +	 */
> +	for (i = 0; i < (2 * a->nfree); i+=2) {
> +		if (a->map.log[i].seq == a->map.log[i + 1].seq)
> +			return BTT_LOG_EQL_SEQ;
> +		if (a->map.log[i].seq > 3 || a->map.log[i + 1].seq > 3)
> +			return BTT_LOG_OOB_SEQ;
> +	}
> +	/*
> +	 * Next, check only the 'new' slot in each lane for the remaining
> +	 * entries being in bounds
> +	 */
> +	for (i = 0; i < a->nfree; i++) {
> +		struct log_entry log;
> +
> +		rc = btt_log_read(a, i, &log);
> +		if (rc)
> +			return rc;
> +
> +		if (log.lba >= a->external_nlba)
> +			return BTT_LOG_OOB_LBA;
> +		if (log.old_map >= a->internal_nlba)
> +			return BTT_LOG_OOB_OLD;
> +		if (log.new_map >= a->internal_nlba)
> +			return BTT_LOG_OOB_NEW;
> +	}
> +	return rc;
> +}
> +
> +/* Check that log entries are self consistent */
> +static int btt_check_map_entries(struct arena_info *a)
> +{
> +	int rc = 0, z, e;
> +	unsigned int i;
> +	__u32 mapping;
> +
> +	for (i = 0; i < a->external_nlba; i++) {
> +		rc = btt_map_read(a, i, &mapping, &z, &e);
> +		if (rc)
> +			return rc;
> +		if (mapping >= a->internal_nlba)
> +			return BTT_MAP_OOB;
> +	}
> +	return 0;
> +}
> +
> +/* Check that each flog entry has the correct corresponding map entry */
> +static int btt_check_log_map(struct arena_info *a)
> +{
> +	unsigned int i;
> +	__u32 mapping;
> +	int rc = 0;
> +
> +	for (i = 0; i < a->nfree; i++) {
> +		struct log_entry log;
> +
> +		rc = btt_log_read(a, i, &log);
> +		if (rc)
> +			return rc;
> +		rc = btt_map_read(a, log.lba, &mapping, NULL, NULL);
> +		if (rc)
> +			return rc;
> +
> +		/*
> +		 * Case where the flog was written, but map couldn't be updated.
> +		 * The kernel should also be able to detect and fix this condition
> +		 */
> +		if (log.new_map != mapping && log.old_map == mapping) {
> +			error("arena %d: log[%d].new_map (%#x) doesn't match map[%#x] (%#x)",
> +				a->num, i, log.new_map, log.lba, mapping);
> +			rc = btt_map_write(a, log.lba, log.new_map, 0, 0);
> +			if (rc)
> +				return BTT_LOG_MAP_ERR;
> +		}
> +	}
> +
> +	return rc;
> +}
> +
> +static int btt_check_info2(struct arena_info *a)
> +{
> +	/*
> +	 * Repair info2 if needed. The main info-block can be trusted
> +	 * as it has been verified during arena discovery
> +	 */
> +	if(memcmp(a->map.info2, a->map.info, BTT_PG_SIZE))
> +		return btt_copy_to_info2(a);
> +	return 0;
> +}
> +
> +static int btt_check_arenas(struct btt_chk *bttc)
> +{
> +	struct arena_info *a = NULL;
> +	int i, rc;
> +
> +	for(i = 0; i < bttc->num_arenas; i++) {
> +		printf("checking arena %d\n", i);
> +		a = &bttc->arena[i];
> +		rc = btt_check_log_entries(a);
> +		if (rc)
> +			break;
> +		rc = btt_check_map_entries(a);
> +		if (rc)
> +			break;
> +		rc = btt_check_log_map(a);
> +		if (rc)
> +			break;
> +		rc = btt_check_info2(a);
> +		if (rc)
> +			break;
> +	}
> +
> +	btt_xlat_status(a, rc);
> +	return rc;
> +}
> +
> +static bool is_namespace_offline(struct ndctl_namespace *ndns)
> +{
> +	struct ndctl_dax *dax = ndctl_namespace_get_dax(ndns);
> +	struct ndctl_btt *btt = ndctl_namespace_get_btt(ndns);
> +	struct ndctl_pfn *pfn = ndctl_namespace_get_pfn(ndns);
> +
> +	if (ndctl_namespace_is_enabled(ndns))
> +		return false;
> +
> +	if (dax && ndctl_dax_is_enabled(dax))
> +		return false;
> +
> +	if (btt && ndctl_btt_is_enabled(btt))
> +		return false;
> +
> +	if (pfn && ndctl_pfn_is_enabled(pfn))
> +		return false;
> +
> +	return true;
> +}
> +
> +static int btt_recover_first_sb(struct btt_chk *bttc)
> +{
> +	__u64 offset, remaining = bttc->rawsize;
> +	int rc, est_arenas = 0;
> +	struct btt_sb *btt_sb;
> +
> +	/* Estimate the number of arenas */
> +	while (remaining) {
> +		if (remaining < ARENA_MIN_SIZE && est_arenas == 0)
> +			return -EINVAL;
> +		if (remaining > ARENA_MAX_SIZE) {
> +			remaining -= ARENA_MAX_SIZE;
> +			est_arenas++;
> +			continue;
> +		}
> +		if (remaining < ARENA_MIN_SIZE)
> +			break;
> +		else {
> +			remaining = 0;
> +			est_arenas++;
> +			break;
> +		}
> +	}
> +
> +	btt_sb = malloc(2 * sizeof(*btt_sb));
> +	if (btt_sb == NULL)
> +		return -ENOMEM;
> +	/* Read the original first info block into btt_sb[0] */
> +	rc = btt_read_info(bttc, &btt_sb[0], 0);
> +	if (rc)
> +		goto out;
> +
> +	/* Attepmt 1: try recovery from expected end of the first arena */
> +	/*
> +	 * Offset calculation has to account for btt_read_info adding the
> +	 * extra 4K, so subtract it here
> +	 */
> +	if (est_arenas == 1)
> +		offset = rounddown(bttc->rawsize - remaining, BTT_PG_SIZE) -
> +			2 * BTT_PG_SIZE;
> +	else
> +		offset = ARENA_MAX_SIZE - 2 * BTT_PG_SIZE;
> +
> +	printf("Attempting recover info-block using info2 at offset %#llx\n", offset);
> +	rc = btt_read_info(bttc, &btt_sb[1], offset);
> +	if (rc)
> +		goto out;
> +	rc = btt_info_verify(bttc, &btt_sb[1]);
> +	if (rc == 0) {
> +		rc = btt_write_info(bttc, &btt_sb[1], 0);
> +		goto out;
> +	}
> +
> +	/*
> +	 * Attempt 2: From the very end of 'rawsize', try to copy the fields
> +	 * that are constant in every arena (only valid when multiple arenas
> +	 * are present)
> +	 */
> +	if (est_arenas > 1) {
> +		offset = rounddown(bttc->rawsize - remaining, BTT_PG_SIZE) -
> +			2 * BTT_PG_SIZE;
> +		printf("Attempting to recover info-block from end offset %#llx\n", offset);
> +		rc = btt_read_info(bttc, &btt_sb[1], offset);
> +		if (rc)
> +			goto out;
> +		/* copy over the arena0 specific fields from btt_sb[0] */
> +		btt_sb[1].flags = btt_sb[0].flags;
> +		btt_sb[1].external_nlba = btt_sb[0].external_nlba;
> +		btt_sb[1].internal_nlba = btt_sb[0].internal_nlba;
> +		btt_sb[1].nextoff = btt_sb[0].nextoff;
> +		btt_sb[1].dataoff = btt_sb[0].dataoff;
> +		btt_sb[1].mapoff = btt_sb[0].mapoff;
> +		btt_sb[1].logoff = btt_sb[0].logoff;
> +		btt_sb[1].info2off = btt_sb[0].info2off;
> +		btt_sb[1].checksum = btt_sb[0].checksum;
> +		rc = btt_info_verify(bttc, &btt_sb[1]);
> +		if (rc == 0) {
> +			rc = btt_write_info(bttc, &btt_sb[1], 0);
> +			goto out;
> +		}
> +	}
> +
> +	/*
> +	 * Attempt 3: use info2off as-is, and check if we find a valid info
> +	 * block at that location.
> +	 */
> +	 offset = le32toh(btt_sb[0].info2off);
> +	 if (offset) {
> +		printf("Attempting to recover info-block from info2 offset (as-is) %#llx\n", offset);
> +		rc = btt_read_info(bttc, &btt_sb[1], offset);
> +		if (rc)
> +			goto out;
> +		rc = btt_info_verify(bttc, &btt_sb[1]);
> +		if (rc == 0) {
> +			rc = btt_write_info(bttc, &btt_sb[1], 0);
> +			goto out;
> +		}
> +	} else
> +		rc = -ENXIO;
> + out:
> +	free(btt_sb);
> +	return rc;
> +}
> +
> +static int namespace_check(struct ndctl_region *region,
> +		struct ndctl_namespace *ndns)
> +{
> +	const char *devname = ndctl_namespace_get_devname(ndns);
> +	struct btt_chk *bttc;
> +	struct btt_sb *btt_sb;
> +	int raw_mode, rc;
> +	char path[50];

If we only know how to check namespaces that are in sector mode,
should you check up front and then bail out with an appropriate message
if the namespace isn't in sector mode?  Otherwise, it just seems
like we fail later with a message that isn't very helpful.

> +
> +	printf("checking %s\n", devname);
> +	if (!is_namespace_offline(ndns)) {
> +		error("%s: check aborted, namespace online\n", devname);
> +		return -EBUSY;
> +	}
> +
> +	raw_mode = ndctl_namespace_get_raw_mode(ndns);
> +	ndctl_namespace_set_raw_mode(ndns, 1);
> +	rc = ndctl_namespace_enable(ndns);
> +	if (rc != 0)
> +		error("%s: failed to enable raw mode\n", devname);

The namespace is now enabled in raw mode.  How do we prevent something
else from seeing the device and opening it?  You open it later but not
in exclusive mode, and it gets opened and closed a few times leaving
windows where something else could open it.  Or is something else
protecting the device?

> +	ndctl_namespace_set_raw_mode(ndns, raw_mode);

What is this second call doing?

> +	sprintf(path, "/dev/%s", ndctl_namespace_get_block_device(ndns));
> +
> +	btt_sb = malloc(sizeof(*btt_sb));
> +	if (btt_sb == NULL) {
> +		rc = -ENOMEM;
> +		goto out;
> +	}
> +
> +	bttc = calloc(1, sizeof(*bttc));
> +	if (bttc == NULL) {
> +		rc = -ENOMEM;
> +		goto out_sb;
> +	}
> +	bttc->path = path;
> +	bttc->rawsize = ndctl_namespace_get_size(ndns);
> +	ndctl_namespace_get_uuid(ndns, bttc->parent_uuid);
> +
> +	rc = btt_read_info(bttc, btt_sb, 0);
> +	if (rc)
> +		goto out_bttc;
> +	rc = btt_info_verify(bttc, btt_sb);
> +	if (rc) {
> +		rc = btt_recover_first_sb(bttc);
> +		if (rc) {
> +			error("Unable to recover any BTT info blocks, aborting\n");
> +			goto out_bttc;
> +		}
> +		rc = btt_read_info(bttc, btt_sb, 0);
> +		if (rc)
> +			goto out_bttc;
> +	}
> +	rc = btt_discover_arenas(bttc);
> +	if (rc)
> +		goto out_bttc;
> +
> +	rc = btt_create_mappings(bttc);
> +	if (rc)
> +		goto out_bttc;
> +
> +	rc = btt_check_arenas(bttc);
> +
> + out_bttc:
> +	btt_remove_mappings(bttc);
> +	free(bttc);
> + out_sb:
> +	free(btt_sb);
> + out:
> +	ndctl_namespace_disable(ndns);

Since you put the namespace into raw mode, do you have to put it back
into sector mode?

> +	return rc;
> +}
> +
> +#else
> +static int namespace_check(struct ndctl_region *region,
> +		struct ndctl_namespace *ndns)
> +{
> +	return -ENOTTY;
> +}
> +#endif
> +
>  static int do_xaction_namespace(const char *namespace,
>  		enum namespace_action action, struct ndctl_ctx *ctx)
>  {
> @@ -765,6 +1646,9 @@ static int do_xaction_namespace(const char *namespace,
>  				case ACTION_DESTROY:
>  					rc = namespace_destroy(region, ndns);
>  					break;
> +				case ACTION_CHECK:
> +					rc = namespace_check(region, ndns);
> +					break;
>  				case ACTION_CREATE:
>  					rc = namespace_reconfig(region, ndns);
>  					if (rc < 0)
> @@ -873,3 +1757,25 @@ int cmd_destroy_namespace(int argc , const char **argv, void *ctx)
>  		return 0;
>  	}
>  }
> +
> +int cmd_check_namespace(int argc , const char **argv, struct ndctl_ctx *ctx)
> +{
> +	char *xable_usage = "ndctl check-namespace <namespace> [<options>]";
> +	const char *namespace = parse_namespace_options(argc, argv,
> +			ACTION_CHECK, check_options, xable_usage);
> +	int checked;
> +
> +	checked = do_xaction_namespace(namespace, ACTION_CHECK, ctx);
> +	if (checked < 0) {
> +		fprintf(stderr, "error checking namespaces: %s\n",
> +				strerror(-checked));
> +		return checked;
> +	} else if (checked == 0) {
> +		fprintf(stderr, "checked 0 namespaces\n");
> +		return 0;
> +	} else {
> +		fprintf(stderr, "checked %d namespace%s\n", checked,
> +				checked > 1 ? "s" : "");
> +		return 0;
> +	}
> +}
> diff --git a/ndctl/ndctl.c b/ndctl/ndctl.c
> index 80a0491..ec018c7 100644
> --- a/ndctl/ndctl.c
> +++ b/ndctl/ndctl.c
> @@ -57,6 +57,9 @@ static struct cmd_struct commands[] = {
>  	{ "disable-namespace", cmd_disable_namespace },
>  	{ "create-namespace", cmd_create_namespace },
>  	{ "destroy-namespace", cmd_destroy_namespace },
> +	#ifdef ENABLE_CHECK_NAMESPACE
> +	{ "check-namespace", cmd_check_namespace },
> +	#endif
>  	{ "enable-region", cmd_enable_region },
>  	{ "disable-region", cmd_disable_region },
>  	{ "enable-dimm", cmd_enable_dimm },
> diff --git a/util/util.h b/util/util.h
> index e0e5f26..d280d10 100644
> --- a/util/util.h
> +++ b/util/util.h
> @@ -23,6 +23,13 @@
>  
>  #define alloc_nr(x) (((x)+16)*3/2)
>  
> +#define rounddown(x, y) (				\
> +{							\
> +	typeof(x) __x = (x);				\
> +	__x - (__x % (y));				\
> +}							\
> +)
> +
>  /*
>   * Realloc the buffer pointed at by variable 'x' so that it can hold
>   * at least 'nr' entries; the number of entries currently allocated
> @@ -44,6 +51,7 @@
>  #define zfree(ptr) ({ free(*ptr); *ptr = NULL; })
>  
>  #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
> +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
>  
>  static inline const char *skip_prefix(const char *str, const char *prefix)
>  {
> 

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
       [not found] ` <20170120031234.24226-1-vishal.l.verma-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
@ 2017-01-20 23:14   ` Jeff Moyer
       [not found]     ` <x49lgu52uk0.fsf-RRHT56Q3PSP4kTEheFKJxxDDeQx5vsVwAInAS/Ez/D0@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Moyer @ 2017-01-20 23:14 UTC (permalink / raw)
  To: Vishal Verma; +Cc: linux-nvdimm-y27Ovi1pjclAfugRpC6u6w

Hi, Vishal,

Vishal Verma <vishal.l.verma-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> writes:

> Add the check-namespace command to ndctl. This will check the BTT
> metadata layout for the given namespace, and if requested, correct any
> errors found. Not all metadata corruption is detectable or fixable.

First, thanks a lot for tackling this!  Overall, the code looks pretty
clean.  I've got a few general comments, and then you can scroll down
for code review.  I didn't get through the whole thing, so I'll try to
pick up where I left off next week.

How was this tested?  Did you perturb the btt in ways that you expect
can and can't be fixed?  Can you add tests for that?

Checking should not be an optional feature.  Just always build it in.

Why should a user have to disable a namespace in order to run the
checker?  I guess that's the analog of unmounting an fs?  Still, seems
like we could just do that for the user.

Can this code in any way be shared with libpmemblk?  They have their own
checker (which does different things), and it seems like a lost
opportunity to not be able to leverage the same code in both places.

Overall, the code could use more comments.

Does this really all belong in builtin-xaction-namespace.c?  It seems
too btt specific for that.  Maybe it's time to create a new file.
I'll leave that up to Dan for his preference.

There seems to be a lot of repeated code sequences around reading and
verifying the btt info blocks.  Perhaps that can be factored out?  Maybe
get rid of the first_sb special snowflake and roll that into the arena
discovery?

> diff --git a/ndctl/btt-structs.h b/ndctl/btt-structs.h

> +#define IB_FLAG_ERROR 0x00000001
> +#define IB_FLAG_ERROR_MASK 0x00000001

IB_FLAG_ERROR is defined but never used/checked.  Of course, neither the
kernel nor this utility will set that flag.  We should probably decide
whether it makes sense to do so.

> +static int btt_read_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
> +{
> +	int fd, rc = 0;
> +	ssize_t size;
> +
> +	fd = open(bttc->path, O_RDONLY);
> +	if (fd < 0) {
> +		error("unable to open %s: %s\n", bttc->path, strerror(errno));
> +		return -errno;
> +	}
> +
> +	size = pread(fd, btt_sb, sizeof(*btt_sb), off + SZ_4K);
> +	if (size != sizeof(*btt_sb)) {
> +		error("unable to read first info block: %s\n", strerror(errno));
> +		rc = -errno;

Short reads will not set errno, so you'll return 0 incorrectly, here.

> +	}
> +	close(fd);
> +	return rc;
> +}

I find it odd that the caller has to pass in the offset 4k before the
start of the info block.  It seems natural for the first info block, but
for the backup, it's totally bogus.  ;-)

> +static int btt_copy_to_info2(struct arena_info *a)
> +{
> +	if (dryrun) {
> +		error("Arena %d: BTT info2 needs to be restored\n", a->num);
> +		return dryrun_msg();
> +	}
> +	printf("Arena %d: Restoring BTT info2\n", a->num);
> +	memcpy(a->map.info2, a->map.info, BTT_PG_SIZE);

missing msync();

> +static int btt_map_read(struct arena_info *a, __u32 lba, __u32 *mapping,
> +			int *trim, int *error)
> +{
> +	__u32 raw_mapping, postmap, ze, z_flag, e_flag;
> +
> +	raw_mapping = le32toh(a->map.map[lba]);
> +	z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
> +	e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
> +	ze = (z_flag << 1) + e_flag;
> +	postmap = raw_mapping & MAP_LBA_MASK;
> +
> +	/* Reuse the {z,e}_flag variables for *trim and *error */
> +	z_flag = 0;
> +	e_flag = 0;
> +
> +	switch (ze) {
> +	case 0:
> +		/* Initial state. Return postmap = premap */
> +		*mapping = lba;
> +		break;
> +	case 1:
> +		*mapping = postmap;
> +		e_flag = 1;
> +		break;
> +	case 2:
> +		*mapping = postmap;
> +		z_flag = 1;
> +		break;
> +	case 3:
> +		*mapping = postmap;
> +		break;
> +	default:
> +		return -EIO;
> +	}
> +
> +	if (trim)
> +		*trim = z_flag;
> +	if (error)
> +		*error = e_flag;
> +
> +	return 0;
> +}

No caller uses z or e, so get rid of those args.  Also, your default
entry in the switch statement can never be hit.  Fixing those, the
function is just:

/*
 * btt_map_lookup
 *
 * Given a pre-map Arena Block Address, return the post-map ABA.
 */
static u32 btt_map_lookup(struct arena_info *a, __u32 lba)
{
        return (le32toh(a->map.map[lba]) & MAP_LBA_MASK);
}

You can take or leave the function name change.  Please keep the
comment, and maybe kernel-doc-ify it.

> +static int btt_map_write(struct arena_info *a, __u32 lba, __u32 mapping,
> +			__u32 z_flag, __u32 e_flag)
> +{

Again, z and e are unused.  Nuke 'em.

[snip]
> +	a->map.map[lba] = htole32(mapping);
> +	if (msync((void *)rounddown((__u64)&a->map.map[lba], BTT_PG_SIZE),
> +		BTT_PG_SIZE, MS_SYNC) < 0)
> +		return errno;

You are aligning msync calls to the BTT_PG_SIZE, which is wrong.  It
should be aligned to the system page size.

In fact, you abuse BTT_PG_SIZE all over the place.  You use it to mean:
- system page size
- size of the btt info block
- the btt page size ;-)

It would be nice if you could clean that up.

Should we perform a load of the info block after the msync to ensure it
was written (to trigger an MCE if one is going to happen)?  That same
question could be asked for each place where an update is made.

> +static void btt_parse_meta(struct arena_info *arena, struct btt_sb *btt_sb,
> +				__u64 arena_off)
> +{
> +	arena->internal_nlba = le32toh(btt_sb->internal_nlba);
> +	arena->internal_lbasize = le32toh(btt_sb->internal_lbasize);
> +	arena->external_nlba = le32toh(btt_sb->external_nlba);
> +	arena->external_lbasize = le32toh(btt_sb->external_lbasize);
> +	arena->nfree = le32toh(btt_sb->nfree);
> +	arena->version_major = le16toh(btt_sb->version_major);
> +	arena->version_minor = le16toh(btt_sb->version_minor);
> +
> +	arena->nextoff = (btt_sb->nextoff == 0) ? 0 : (arena_off +
> +			le64toh(btt_sb->nextoff));
> +	arena->infooff = arena_off;
> +	arena->dataoff = arena_off + le64toh(btt_sb->dataoff);
> +	arena->mapoff = arena_off + le64toh(btt_sb->mapoff);
> +	arena->logoff = arena_off + le64toh(btt_sb->logoff);
> +	arena->info2off = arena_off + le64toh(btt_sb->info2off);
> +
> +	arena->size = (le64toh(btt_sb->nextoff) > 0)
> +		? (le64toh(btt_sb->nextoff))
> +		: (arena->info2off - arena->infooff + BTT_PG_SIZE);
> +
> +	arena->flags = le32toh(btt_sb->flags);
> +}

Maybe it doesn't belong in the parse routine, but there is no sanity
checking on this data, and there really should be.

> +/* Check that log entries are self consistent */
> +static int btt_check_map_entries(struct arena_info *a)
> +{
> +	int rc = 0, z, e;
> +	unsigned int i;
> +	__u32 mapping;
> +
> +	for (i = 0; i < a->external_nlba; i++) {
> +		rc = btt_map_read(a, i, &mapping, &z, &e);
> +		if (rc)
> +			return rc;
> +		if (mapping >= a->internal_nlba)
> +			return BTT_MAP_OOB;
> +	}
> +	return 0;
> +}

libpmemblk's version of this keeps a bitmap of allocated blocks and free
blocks (from the flog), and then generates a warning if there are
duplicate blocks allocated or unreachable blocks.  I think we should do
the same.


> +/* Check that each flog entry has the correct corresponding map entry */
> +static int btt_check_log_map(struct arena_info *a)
> +{
> +	unsigned int i;
> +	__u32 mapping;
> +	int rc = 0;
> +
> +	for (i = 0; i < a->nfree; i++) {
> +		struct log_entry log;
> +
> +		rc = btt_log_read(a, i, &log);
> +		if (rc)
> +			return rc;
> +		rc = btt_map_read(a, log.lba, &mapping, NULL, NULL);
> +		if (rc)
> +			return rc;
> +
> +		/*
> +		 * Case where the flog was written, but map couldn't be updated.
> +		 * The kernel should also be able to detect and fix this condition
> +		 */
> +		if (log.new_map != mapping && log.old_map == mapping) {
> +			error("arena %d: log[%d].new_map (%#x) doesn't match map[%#x] (%#x)",
> +				a->num, i, log.new_map, log.lba, mapping);

This isn't an error.  We expect this to happen if power is suddenly
lost, and the format is built to recover from it.  If you log it at all,
I would make it an informational message.

I wonder whether we should even fix it.  I worry about having the same
recovery code in multiple places (the kernel and here).  What do you
think?

> +static int btt_recover_first_sb(struct btt_chk *bttc)
> +{
[snip]
> +	/*
> +	 * Attempt 2: From the very end of 'rawsize', try to copy the fields
> +	 * that are constant in every arena (only valid when multiple arenas
> +	 * are present)
> +	 */
> +	if (est_arenas > 1) {
> +		offset = rounddown(bttc->rawsize - remaining, BTT_PG_SIZE) -
> +			2 * BTT_PG_SIZE;
> +		printf("Attempting to recover info-block from end offset %#llx\n", offset);
> +		rc = btt_read_info(bttc, &btt_sb[1], offset);
> +		if (rc)
> +			goto out;

How about we verify the info block before blindlly copying in the data?
I know you verify the result, but it's weird.

> +		/* copy over the arena0 specific fields from btt_sb[0] */
> +		btt_sb[1].flags = btt_sb[0].flags;
> +		btt_sb[1].external_nlba = btt_sb[0].external_nlba;
> +		btt_sb[1].internal_nlba = btt_sb[0].internal_nlba;
> +		btt_sb[1].nextoff = btt_sb[0].nextoff;
> +		btt_sb[1].dataoff = btt_sb[0].dataoff;
> +		btt_sb[1].mapoff = btt_sb[0].mapoff;
> +		btt_sb[1].logoff = btt_sb[0].logoff;
> +		btt_sb[1].info2off = btt_sb[0].info2off;
> +		btt_sb[1].checksum = btt_sb[0].checksum;
> +		rc = btt_info_verify(bttc, &btt_sb[1]);
> +		if (rc == 0) {
> +			rc = btt_write_info(bttc, &btt_sb[1], 0);
> +			goto out;
> +		}
> +	}
> +
> +	/*
> +	 * Attempt 3: use info2off as-is, and check if we find a valid info
> +	 * block at that location.
> +	 */
> +	 offset = le32toh(btt_sb[0].info2off);
> +	 if (offset) {

Maybe do some bounds checking on offset?

> +		printf("Attempting to recover info-block from info2 offset (as-is) %#llx\n", offset);
> +		rc = btt_read_info(bttc, &btt_sb[1], offset);
> +		if (rc)
> +			goto out;
> +		rc = btt_info_verify(bttc, &btt_sb[1]);
> +		if (rc == 0) {
> +			rc = btt_write_info(bttc, &btt_sb[1], 0);
> +			goto out;
> +		}
> +	} else
> +		rc = -ENXIO;

You could also look for the primary info blocks for the other arenas.

> + out:
> +	free(btt_sb);
> +	return rc;
> +}
> +
> +static int namespace_check(struct ndctl_region *region,
> +		struct ndctl_namespace *ndns)
> +{

namespace_check sounds generic, but it really only works for the btt.
Also, should we verify that the namespace was configured with a btt
before even attempting to check it?

> +	raw_mode = ndctl_namespace_get_raw_mode(ndns);
> +	ndctl_namespace_set_raw_mode(ndns, 1);
> +	rc = ndctl_namespace_enable(ndns);
> +	if (rc != 0)
> +		error("%s: failed to enable raw mode\n", devname);
> +	ndctl_namespace_set_raw_mode(ndns, raw_mode);

What the heck do the above 6 lines do?  Please explain.

Thanks!
Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
       [not found]     ` <x49lgu52uk0.fsf-RRHT56Q3PSP4kTEheFKJxxDDeQx5vsVwAInAS/Ez/D0@public.gmane.org>
@ 2017-01-21  0:41       ` Verma, Vishal L
       [not found]         ` <1484959224.4857.6.camel-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  2017-01-31  0:12       ` Verma, Vishal L
  1 sibling, 1 reply; 11+ messages in thread
From: Verma, Vishal L @ 2017-01-21  0:41 UTC (permalink / raw)
  To: jmoyer-H+wXaHxf7aLQT0dZR+AlfA; +Cc: linux-nvdimm-y27Ovi1pjclAfugRpC6u6w

On Fri, 2017-01-20 at 18:14 -0500, Jeff Moyer wrote:
> Hi, Vishal,
> 
> Vishal Verma <vishal.l.verma@intel.com> writes:
> 
> > Add the check-namespace command to ndctl. This will check the BTT
> > metadata layout for the given namespace, and if requested, correct
> > any
> > errors found. Not all metadata corruption is detectable or fixable.
> 
> First, thanks a lot for tackling this!  Overall, the code looks pretty
> clean.  I've got a few general comments, and then you can scroll down
> for code review.  I didn't get through the whole thing, so I'll try to
> pick up where I left off next week.

Thanks Jeff!

> 
> How was this tested?  Did you perturb the btt in ways that you expect
> can and can't be fixed?  Can you add tests for that?

Yes, so for some things I tested by essentially hand editing the
metadata. But for other things - mainly multi-arena testing, and the
different recovery attempts for first_sb, I modified the kernel btt to
produce small 8MB arenas, and did the same in the checker. For the
former, those can be 'formalized' as ndctl unit tests, and I will start
doing that. But for the latter, I don't know if there's an easy way to
get full coverage on those cases.

> 
> Checking should not be an optional feature.  Just always build it in.

You mean checking metadata when bringing up the namespace? The kernel
does essential checks of course, but do you mean the more involved ones
like log/map entry bounds, the backup info block etc?

> 
> Why should a user have to disable a namespace in order to run the
> checker?  I guess that's the analog of unmounting an fs?  Still, seems
> like we could just do that for the user.

Mostly so that the user can be sure nothing else is using it when the
check starts? We could disable it for them, but I just thought this
would be safer? An option could be providing a --force option like I
think destroy or reconfig do..

> 
> Can this code in any way be shared with libpmemblk?  They have their
> own
> checker (which does different things), and it seems like a lost
> opportunity to not be able to leverage the same code in both places.

We started out with that, in fact with depending on libpmemblk's checker
and calling into their library, but it was odd for ndctl (a core system
library of sorts) to depend on libpmemblk (an 'application' library).
Then we considered moving the libpmemblk checker parts into libndctl,
and making pmemblk depend on us, but they also used their library on
windows, so we couldn't just move it. Finally we just decided to have a
copy of the code in ndctl, so I rewrote it in a ndctl native way.

> 
> Overall, the code could use more comments.

Ok, I'll add some descriptions to the trickier sections.

> 
> Does this really all belong in builtin-xaction-namespace.c?  It seems
> too btt specific for that.  Maybe it's time to create a new file.
> I'll leave that up to Dan for his preference.

Dan had the same feedback, will move to a new file.

> 
> There seems to be a lot of repeated code sequences around reading and
> verifying the btt info blocks.  Perhaps that can be factored
> out?  Maybe
> get rid of the first_sb special snowflake and roll that into the arena
> discovery?
> 
> > diff --git a/ndctl/btt-structs.h b/ndctl/btt-structs.h
> > +#define IB_FLAG_ERROR 0x00000001
> > +#define IB_FLAG_ERROR_MASK 0x00000001
> 
> IB_FLAG_ERROR is defined but never used/checked.  Of course, neither
> the
> kernel nor this utility will set that flag.  We should probably decide
> whether it makes sense to do so.

True, will remove.

> 
> > +static int btt_read_info(struct btt_chk *bttc, struct btt_sb
> > *btt_sb, __u64 off)
> > +{
> > +	int fd, rc = 0;
> > +	ssize_t size;
> > +
> > +	fd = open(bttc->path, O_RDONLY);
> > +	if (fd < 0) {
> > +		error("unable to open %s: %s\n", bttc->path,
> > strerror(errno));
> > +		return -errno;
> > +	}
> > +
> > +	size = pread(fd, btt_sb, sizeof(*btt_sb), off + SZ_4K);
> > +	if (size != sizeof(*btt_sb)) {
> > +		error("unable to read first info block: %s\n",
> > strerror(errno));
> > +		rc = -errno;
> 
> Short reads will not set errno, so you'll return 0 incorrectly, here.

Good catch, I'll fix.

> 
> > +	}
> > +	close(fd);
> > +	return rc;
> > +}
> 
> I find it odd that the caller has to pass in the offset 4k before the
> start of the info block.  It seems natural for the first info block,
> but
> for the backup, it's totally bogus.  ;-)

I agree, and I hated it too :)
But I wanted to keep the +4K addition in the lowest level functions..
Have a better idea? :)  This does kinda kill the idea of "oh the 4k
offsetting will be completely hidden from the higher level logic"

The opposite of this would be sprinkling a number of +4K everywhere
else, and I think with this we only have to do the inverse in a couple
of places..

> 
> > +static int btt_copy_to_info2(struct arena_info *a)
> > +{
> > +	if (dryrun) {
> > +		error("Arena %d: BTT info2 needs to be restored\n",
> > a->num);
> > +		return dryrun_msg();
> > +	}
> > +	printf("Arena %d: Restoring BTT info2\n", a->num);
> > +	memcpy(a->map.info2, a->map.info, BTT_PG_SIZE);
> 
> missing msync();

Good catch.

> 
> > +static int btt_map_read(struct arena_info *a, __u32 lba, __u32
> > *mapping,
> > +			int *trim, int *error)
> > +{
> > +	__u32 raw_mapping, postmap, ze, z_flag, e_flag;
> > +
> > +	raw_mapping = le32toh(a->map.map[lba]);
> > +	z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
> > +	e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
> > +	ze = (z_flag << 1) + e_flag;
> > +	postmap = raw_mapping & MAP_LBA_MASK;
> > +
> > +	/* Reuse the {z,e}_flag variables for *trim and *error */
> > +	z_flag = 0;
> > +	e_flag = 0;
> > +
> > +	switch (ze) {
> > +	case 0:
> > +		/* Initial state. Return postmap = premap */
> > +		*mapping = lba;
> > +		break;
> > +	case 1:
> > +		*mapping = postmap;
> > +		e_flag = 1;
> > +		break;
> > +	case 2:
> > +		*mapping = postmap;
> > +		z_flag = 1;
> > +		break;
> > +	case 3:
> > +		*mapping = postmap;
> > +		break;
> > +	default:
> > +		return -EIO;
> > +	}
> > +
> > +	if (trim)
> > +		*trim = z_flag;
> > +	if (error)
> > +		*error = e_flag;
> > +
> > +	return 0;
> > +}
> 
> No caller uses z or e, so get rid of those args.  Also, your default
> entry in the switch statement can never be hit.  Fixing those, the
> function is just:
> 
> /*
>  * btt_map_lookup
>  *
>  * Given a pre-map Arena Block Address, return the post-map ABA.
>  */
> static u32 btt_map_lookup(struct arena_info *a, __u32 lba)
> {
>         return (le32toh(a->map.map[lba]) & MAP_LBA_MASK);
> }
> 
> You can take or leave the function name change.  Please keep the
> comment, and maybe kernel-doc-ify it.
> 
> > +static int btt_map_write(struct arena_info *a, __u32 lba, __u32
> > mapping,
> > +			__u32 z_flag, __u32 e_flag)
> > +{
> 
> Again, z and e are unused.  Nuke 'em.

True, I just copied these over from the kernel implementation, and
wasn't sure if I'd ever be using z and e. Currently the kernel doesn't
even set those, so it is a bit redundant. If the kernel does grow the
ability to use them in future, there could be use for these, but I
suppose I should remove the dead code until that happens :)

> 
> [snip]
> > +	a->map.map[lba] = htole32(mapping);
> > +	if (msync((void *)rounddown((__u64)&a->map.map[lba],
> > BTT_PG_SIZE),
> > +		BTT_PG_SIZE, MS_SYNC) < 0)
> > +		return errno;
> 
> You are aligning msync calls to the BTT_PG_SIZE, which is wrong.  It
> should be aligned to the system page size.
> 
> In fact, you abuse BTT_PG_SIZE all over the place.  You use it to
> mean:
> - system page size
> - size of the btt info block
> - the btt page size ;-)
> 
> It would be nice if you could clean that up.

Yes.. I was just (ab)using the fact that they're going to be the same,
but I should distinguish between the different usages better. I'll fix
it up.

> 
> Should we perform a load of the info block after the msync to ensure
> it
> was written (to trigger an MCE if one is going to happen)?  That same
> question could be asked for each place where an update is made.

Good question. I guess the probability of those locations having a bad
cell is low because we did just read them to find out there is a
problem, and subsequently writing them should have a low probability of
triggering a CMCI. But re-reading to make sure it was correctly written
is not a bad idea. Dan, thoughts?

> 
> > +static void btt_parse_meta(struct arena_info *arena, struct btt_sb
> > *btt_sb,
> > +				__u64 arena_off)
> > +{
> > +	arena->internal_nlba = le32toh(btt_sb->internal_nlba);
> > +	arena->internal_lbasize = le32toh(btt_sb-
> > >internal_lbasize);
> > +	arena->external_nlba = le32toh(btt_sb->external_nlba);
> > +	arena->external_lbasize = le32toh(btt_sb-
> > >external_lbasize);
> > +	arena->nfree = le32toh(btt_sb->nfree);
> > +	arena->version_major = le16toh(btt_sb->version_major);
> > +	arena->version_minor = le16toh(btt_sb->version_minor);
> > +
> > +	arena->nextoff = (btt_sb->nextoff == 0) ? 0 : (arena_off +
> > +			le64toh(btt_sb->nextoff));
> > +	arena->infooff = arena_off;
> > +	arena->dataoff = arena_off + le64toh(btt_sb->dataoff);
> > +	arena->mapoff = arena_off + le64toh(btt_sb->mapoff);
> > +	arena->logoff = arena_off + le64toh(btt_sb->logoff);
> > +	arena->info2off = arena_off + le64toh(btt_sb->info2off);
> > +
> > +	arena->size = (le64toh(btt_sb->nextoff) > 0)
> > +		? (le64toh(btt_sb->nextoff))
> > +		: (arena->info2off - arena->infooff + BTT_PG_SIZE);
> > +
> > +	arena->flags = le32toh(btt_sb->flags);
> > +}
> 
> Maybe it doesn't belong in the parse routine, but there is no sanity
> checking on this data, and there really should be.

Good point, I'll add some.

> 
> > +/* Check that log entries are self consistent */
> > +static int btt_check_map_entries(struct arena_info *a)
> > +{
> > +	int rc = 0, z, e;
> > +	unsigned int i;
> > +	__u32 mapping;
> > +
> > +	for (i = 0; i < a->external_nlba; i++) {
> > +		rc = btt_map_read(a, i, &mapping, &z, &e);
> > +		if (rc)
> > +			return rc;
> > +		if (mapping >= a->internal_nlba)
> > +			return BTT_MAP_OOB;
> > +	}
> > +	return 0;
> > +}
> 
> libpmemblk's version of this keeps a bitmap of allocated blocks and
> free
> blocks (from the flog), and then generates a warning if there are
> duplicate blocks allocated or unreachable blocks.  I think we should
> do
> the same.

Yep I saw that, I'll add it.

> 
> 
> > +/* Check that each flog entry has the correct corresponding map
> > entry */
> > +static int btt_check_log_map(struct arena_info *a)
> > +{
> > +	unsigned int i;
> > +	__u32 mapping;
> > +	int rc = 0;
> > +
> > +	for (i = 0; i < a->nfree; i++) {
> > +		struct log_entry log;
> > +
> > +		rc = btt_log_read(a, i, &log);
> > +		if (rc)
> > +			return rc;
> > +		rc = btt_map_read(a, log.lba, &mapping, NULL,
> > NULL);
> > +		if (rc)
> > +			return rc;
> > +
> > +		/*
> > +		 * Case where the flog was written, but map
> > couldn't be updated.
> > +		 * The kernel should also be able to detect and fix
> > this condition
> > +		 */
> > +		if (log.new_map != mapping && log.old_map ==
> > mapping) {
> > +			error("arena %d: log[%d].new_map (%#x)
> > doesn't match map[%#x] (%#x)",
> > +				a->num, i, log.new_map, log.lba,
> > mapping);
> 
> This isn't an error.  We expect this to happen if power is suddenly
> lost, and the format is built to recover from it.  If you log it at
> all,
> I would make it an informational message.
> 
> I wonder whether we should even fix it.  I worry about having the same
> recovery code in multiple places (the kernel and here).  What do you
> think?

Hm, I agree about the message, in that it is routine. But I added the
repairing so that a user may just decide to do a check using this after
a power failure, and we can easily fix it when we have the chance. I do
agree about the check/correction in multiple places, I wouldn't be too
opposed to removing it entirely from here.

> 
> > +static int btt_recover_first_sb(struct btt_chk *bttc)
> > +{
> 
> [snip]
> > +	/*
> > +	 * Attempt 2: From the very end of 'rawsize', try to copy
> > the fields
> > +	 * that are constant in every arena (only valid when
> > multiple arenas
> > +	 * are present)
> > +	 */
> > +	if (est_arenas > 1) {
> > +		offset = rounddown(bttc->rawsize - remaining,
> > BTT_PG_SIZE) -
> > +			2 * BTT_PG_SIZE;
> > +		printf("Attempting to recover info-block from end
> > offset %#llx\n", offset);
> > +		rc = btt_read_info(bttc, &btt_sb[1], offset);
> > +		if (rc)
> > +			goto out;
> 
> How about we verify the info block before blindlly copying in the
> data?
> I know you verify the result, but it's weird.

Fair point, and should be easy to do.

> 
> > +		/* copy over the arena0 specific fields from
> > btt_sb[0] */
> > +		btt_sb[1].flags = btt_sb[0].flags;
> > +		btt_sb[1].external_nlba = btt_sb[0].external_nlba;
> > +		btt_sb[1].internal_nlba = btt_sb[0].internal_nlba;
> > +		btt_sb[1].nextoff = btt_sb[0].nextoff;
> > +		btt_sb[1].dataoff = btt_sb[0].dataoff;
> > +		btt_sb[1].mapoff = btt_sb[0].mapoff;
> > +		btt_sb[1].logoff = btt_sb[0].logoff;
> > +		btt_sb[1].info2off = btt_sb[0].info2off;
> > +		btt_sb[1].checksum = btt_sb[0].checksum;
> > +		rc = btt_info_verify(bttc, &btt_sb[1]);
> > +		if (rc == 0) {
> > +			rc = btt_write_info(bttc, &btt_sb[1], 0);
> > +			goto out;
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * Attempt 3: use info2off as-is, and check if we find a
> > valid info
> > +	 * block at that location.
> > +	 */
> > +	 offset = le32toh(btt_sb[0].info2off);
> > +	 if (offset) {
> 
> Maybe do some bounds checking on offset?

If the offset is bogus either the read will fail or the verify will, but
yes I could add checking too.

> 
> > +		printf("Attempting to recover info-block from info2
> > offset (as-is) %#llx\n", offset);
> > +		rc = btt_read_info(bttc, &btt_sb[1], offset);
> > +		if (rc)
> > +			goto out;
> > +		rc = btt_info_verify(bttc, &btt_sb[1]);
> > +		if (rc == 0) {
> > +			rc = btt_write_info(bttc, &btt_sb[1], 0);
> > +			goto out;
> > +		}
> > +	} else
> > +		rc = -ENXIO;
> 
> You could also look for the primary info blocks for the other arenas.

That seems kind of difficult as we don't really have any information on
where the other arenas might be.
We could do a full calculation of "this is where I would lay out the
arenas if I had this rawsize", but I was hoping to avoid doing that..

> 
> > + out:
> > +	free(btt_sb);
> > +	return rc;
> > +}
> > +
> > +static int namespace_check(struct ndctl_region *region,
> > +		struct ndctl_namespace *ndns)
> > +{
> 
> namespace_check sounds generic, but it really only works for the btt.
> Also, should we verify that the namespace was configured with a btt
> before even attempting to check it?

True. I did think of checking if this is a btt namespace before
checking, but I think if the namespace is disabled, my understanding is
ndctl and libndctl have no idea whether it is a btt namespace or not.
The BTT is discovered by the kernel when doing it's probe, and once it
is up, ndctl can see that, but until then, it can really only find out
if there is a valid BTT by trying to read the metadata itself.

If there is a future, different metadata format, I'd imagine its
checking would also begin here, but branch off based on which format was
found.

> 
> > +	raw_mode = ndctl_namespace_get_raw_mode(ndns);
> > +	ndctl_namespace_set_raw_mode(ndns, 1);
> > +	rc = ndctl_namespace_enable(ndns);
> > +	if (rc != 0)
> > +		error("%s: failed to enable raw mode\n", devname);
> > +	ndctl_namespace_set_raw_mode(ndns, raw_mode);
> 
> What the heck do the above 6 lines do?  Please explain.

Linda had the same question so I suppose I should at least have a
comment there :)
But the idea is we get the existing mode (raw or not), and save it in
raw_mode. Then try to enable it with raw_mode = 1. If it fails, restore
the saved raw_mode.

Maybe this is a bit pointless? But we should try and keep things in the
same state the user had, right? :)

> 
> Thanks!
> Jeff


Thanks for the review, I'll start working on the changes and send out a
v2 next week!
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
  2017-01-20 20:04 ` Dan Williams
@ 2017-01-23 17:42   ` Ross Zwisler
  2017-01-23 17:49     ` Dan Williams
  0 siblings, 1 reply; 11+ messages in thread
From: Ross Zwisler @ 2017-01-23 17:42 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-nvdimm

On Fri, Jan 20, 2017 at 12:04:46PM -0800, Dan Williams wrote:
> [ adding Jeff ]
> 
> Some comments on boilerplate / organization items...
> 
> 
> On Thu, Jan 19, 2017 at 7:12 PM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> > Add the check-namespace command to ndctl. This will check the BTT
> > metadata layout for the given namespace, and if requested, correct any
> > errors found. Not all metadata corruption is detectable or fixable.
> >
> > Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
<>
> > +static int btt_read_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
> > +{
> > +       int fd, rc = 0;
> > +       ssize_t size;
> > +
> > +       fd = open(bttc->path, O_RDONLY);
> 
> Let's not open and close the btt device on each call. Open it once at
> the beginning of time and open it O_EXCL so that we prevent the kernel
> trying to claim the device. Let's also use O_DIRECT to bypass the
> block_device page cache.

I thought O_EXCL only mattered if it was used for creation of new files with
O_CREAT?

https://linux.die.net/man/3/open

For us bttc->path is our device in /dev/, so that O_EXCL shouldn't do anything
for us?
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
  2017-01-23 17:42   ` Ross Zwisler
@ 2017-01-23 17:49     ` Dan Williams
  0 siblings, 0 replies; 11+ messages in thread
From: Dan Williams @ 2017-01-23 17:49 UTC (permalink / raw)
  To: Ross Zwisler; +Cc: linux-nvdimm

On Mon, Jan 23, 2017 at 9:42 AM, Ross Zwisler
<ross.zwisler@linux.intel.com> wrote:
> On Fri, Jan 20, 2017 at 12:04:46PM -0800, Dan Williams wrote:
>> [ adding Jeff ]
>>
>> Some comments on boilerplate / organization items...
>>
>>
>> On Thu, Jan 19, 2017 at 7:12 PM, Vishal Verma <vishal.l.verma@intel.com> wrote:
>> > Add the check-namespace command to ndctl. This will check the BTT
>> > metadata layout for the given namespace, and if requested, correct any
>> > errors found. Not all metadata corruption is detectable or fixable.
>> >
>> > Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
> <>
>> > +static int btt_read_info(struct btt_chk *bttc, struct btt_sb *btt_sb, __u64 off)
>> > +{
>> > +       int fd, rc = 0;
>> > +       ssize_t size;
>> > +
>> > +       fd = open(bttc->path, O_RDONLY);
>>
>> Let's not open and close the btt device on each call. Open it once at
>> the beginning of time and open it O_EXCL so that we prevent the kernel
>> trying to claim the device. Let's also use O_DIRECT to bypass the
>> block_device page cache.
>
> I thought O_EXCL only mattered if it was used for creation of new files with
> O_CREAT?
>
> https://linux.die.net/man/3/open
>
> For us bttc->path is our device in /dev/, so that O_EXCL shouldn't do anything
> for us?

You overlooked this bit in man 2 open:

"In general, the behavior of  O_EXCL  is  undefined  if  it  is  used
without O_CREAT.   There is one exception: on Linux 2.6 and later,
O_EXCL can be used without O_CREAT if pathname refers to a block
device.  If the block device is in use by the system (e.g., mounted),
open() fails with the error EBUSY."
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
       [not found]         ` <1484959224.4857.6.camel-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
@ 2017-01-23 18:20           ` Jeff Moyer
       [not found]             ` <x49vat5ljtz.fsf-RRHT56Q3PSP4kTEheFKJxxDDeQx5vsVwAInAS/Ez/D0@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Moyer @ 2017-01-23 18:20 UTC (permalink / raw)
  To: Verma, Vishal L; +Cc: linux-nvdimm-y27Ovi1pjclAfugRpC6u6w

"Verma, Vishal L" <vishal.l.verma@intel.com> writes:

> On Fri, 2017-01-20 at 18:14 -0500, Jeff Moyer wrote:

>> How was this tested?  Did you perturb the btt in ways that you expect
>> can and can't be fixed?  Can you add tests for that?
>
> Yes, so for some things I tested by essentially hand editing the
> metadata. But for other things - mainly multi-arena testing, and the
> different recovery attempts for first_sb, I modified the kernel btt to
> produce small 8MB arenas, and did the same in the checker. For the
> former, those can be 'formalized' as ndctl unit tests, and I will start
> doing that. But for the latter, I don't know if there's an easy way to
> get full coverage on those cases.

OK, yeah, let's definitely put tests in the tree for anything that is
easy to do.  Can't we create test nfits large enough for multiple
arenas?  One thing we could do is keep around bogus metadata images and
just shove them into the raw device.  Only if that's easier than
programmatically perturbuing things.

Also, we may consider a btt metadata dump utility, much like file
systems have.

>> Checking should not be an optional feature.  Just always build it in.
>
> You mean checking metadata when bringing up the namespace? The kernel
> does essential checks of course, but do you mean the more involved ones
> like log/map entry bounds, the backup info block etc?

I mean't the "check" ndctl command shouldn't be selectable via
configure.  I think Dan had the same comment.

>> Why should a user have to disable a namespace in order to run the
>> checker?  I guess that's the analog of unmounting an fs?  Still, seems
>> like we could just do that for the user.
>
> Mostly so that the user can be sure nothing else is using it when the
> check starts? We could disable it for them, but I just thought this
> would be safer? An option could be providing a --force option like I
> think destroy or reconfig do..

I think that's a good compromise.

>> Can this code in any way be shared with libpmemblk?  They have their
>> own
>> checker (which does different things), and it seems like a lost
>> opportunity to not be able to leverage the same code in both places.
>
> We started out with that, in fact with depending on libpmemblk's checker
> and calling into their library, but it was odd for ndctl (a core system
> library of sorts) to depend on libpmemblk (an 'application' library).
> Then we considered moving the libpmemblk checker parts into libndctl,
> and making pmemblk depend on us, but they also used their library on
> windows, so we couldn't just move it. Finally we just decided to have a
> copy of the code in ndctl, so I rewrote it in a ndctl native way.

OK.

>> There seems to be a lot of repeated code sequences around reading and
>> verifying the btt info blocks.  Perhaps that can be factored
>> out?  Maybe
>> get rid of the first_sb special snowflake and roll that into the arena
>> discovery?

You didn't comment on this one... did you miss it, or will you
investigate, or do you think it's a bogus comment?

>> > diff --git a/ndctl/btt-structs.h b/ndctl/btt-structs.h
>> > +#define IB_FLAG_ERROR 0x00000001
>> > +#define IB_FLAG_ERROR_MASK 0x00000001
>> 
>> IB_FLAG_ERROR is defined but never used/checked.  Of course, neither
>> the
>> kernel nor this utility will set that flag.  We should probably decide
>> whether it makes sense to do so.
>
> True, will remove.

Well, hold on.  This flag exists in the spec, so at the very least we
should consider where it should be set, and what to do if it /is/ set.
Let's not just remove it.

>> I find it odd that the caller has to pass in the offset 4k before the
>> start of the info block.  It seems natural for the first info block,
>> but
>> for the backup, it's totally bogus.  ;-)
>
> I agree, and I hated it too :)
> But I wanted to keep the +4K addition in the lowest level functions..
> Have a better idea? :)  This does kinda kill the idea of "oh the 4k
> offsetting will be completely hidden from the higher level logic"
>
> The opposite of this would be sprinkling a number of +4K everywhere
> else, and I think with this we only have to do the inverse in a couple
> of places..

I guess I'd have to see the result of inverting the roles.  Feel free to
hold off on this one, I don't think it's critical.

>> Again, z and e are unused.  Nuke 'em.
>
> True, I just copied these over from the kernel implementation, and
> wasn't sure if I'd ever be using z and e. Currently the kernel doesn't
> even set those, so it is a bit redundant. If the kernel does grow the
> ability to use them in future, there could be use for these, but I
> suppose I should remove the dead code until that happens :)

Every single state of those two bits is valid.  How would you ever do
something useful with them?

>> Should we perform a load of the info block after the msync to ensure
>> it
>> was written (to trigger an MCE if one is going to happen)?  That same
>> question could be asked for each place where an update is made.
>
> Good question. I guess the probability of those locations having a bad
> cell is low because we did just read them to find out there is a
> problem, and subsequently writing them should have a low probability of
> triggering a CMCI. But re-reading to make sure it was correctly written
> is not a bad idea. Dan, thoughts?

This one's not critical.  But it's a checker tool, so it might be nice
to be paranoid.

>> This isn't an error.  We expect this to happen if power is suddenly
>> lost, and the format is built to recover from it.  If you log it at
>> all, I would make it an informational message.
>> 
>> I wonder whether we should even fix it.  I worry about having the same
>> recovery code in multiple places (the kernel and here).  What do you
>> think?
>
> Hm, I agree about the message, in that it is routine. But I added the
> repairing so that a user may just decide to do a check using this after
> a power failure, and we can easily fix it when we have the chance. I do
> agree about the check/correction in multiple places, I wouldn't be too
> opposed to removing it entirely from here.

If you remove it, put a big comment in there.  :)  I'll leave the
ultimate decision up to you.  It's not complex recovery, so we *should*
be okay to have that logic in multiple places...

>> > +	/*
>> > +	 * Attempt 3: use info2off as-is, and check if we find a
>> > valid info
>> > +	 * block at that location.
>> > +	 */
>> > +	 offset = le32toh(btt_sb[0].info2off);
>> > +	 if (offset) {
>> 
>> Maybe do some bounds checking on offset?
>
> If the offset is bogus either the read will fail or the verify will, but
> yes I could add checking too.

Again, not critical.  I'll leave it up to you.

>> > +		printf("Attempting to recover info-block from info2
>> > offset (as-is) %#llx\n", offset);
>> > +		rc = btt_read_info(bttc, &btt_sb[1], offset);
>> > +		if (rc)
>> > +			goto out;
>> > +		rc = btt_info_verify(bttc, &btt_sb[1]);
>> > +		if (rc == 0) {
>> > +			rc = btt_write_info(bttc, &btt_sb[1], 0);
>> > +			goto out;
>> > +		}
>> > +	} else
>> > +		rc = -ENXIO;
>> 
>> You could also look for the primary info blocks for the other arenas.
>
> That seems kind of difficult as we don't really have any information on
> where the other arenas might be.
> We could do a full calculation of "this is where I would lay out the
> arenas if I had this rawsize", but I was hoping to avoid doing that..

OK.  If we get to here, things are pretty well screwed.  Feel free to
ignore that suggestion.

>> > + out:
>> > +	free(btt_sb);
>> > +	return rc;
>> > +}
>> > +
>> > +static int namespace_check(struct ndctl_region *region,
>> > +		struct ndctl_namespace *ndns)
>> > +{
>> 
>> namespace_check sounds generic, but it really only works for the btt.
>> Also, should we verify that the namespace was configured with a btt
>> before even attempting to check it?
>
> True. I did think of checking if this is a btt namespace before
> checking, but I think if the namespace is disabled, my understanding is
> ndctl and libndctl have no idea whether it is a btt namespace or not.
> The BTT is discovered by the kernel when doing it's probe, and once it
> is up, ndctl can see that, but until then, it can really only find out
> if there is a valid BTT by trying to read the metadata itself.

Yeah.  At some point I guess we have to trust the admin not to try to
fix the btt on a non-btt device.  However, if we allowed them to run the
command on a pmemNs device, that would reduce the opportunities for
failure.

>> > +	raw_mode = ndctl_namespace_get_raw_mode(ndns);
>> > +	ndctl_namespace_set_raw_mode(ndns, 1);
>> > +	rc = ndctl_namespace_enable(ndns);
>> > +	if (rc != 0)
>> > +		error("%s: failed to enable raw mode\n", devname);
>> > +	ndctl_namespace_set_raw_mode(ndns, raw_mode);
>> 
>> What the heck do the above 6 lines do?  Please explain.
>
> Linda had the same question so I suppose I should at least have a
> comment there :)
> But the idea is we get the existing mode (raw or not), and save it in
> raw_mode. Then try to enable it with raw_mode = 1. If it fails, restore
> the saved raw_mode.

I had more basic questions, like what the heck is going on?  :-) I've
done some more reading, and this is what I think is happening.

	/* In typical usage, the current raw_mode should be false. */
	raw_mode = ndctl_namespace_get_raw_mode(ndns);
	/*
	 * Putting the namespace into raw mode will allow us to access
	 * the btt metadata.
	 */
	ndctl_namespace_set_raw_mode(ndns, 1);
	/*
	 * Now enable the namespace.  This will result in a pmem device
	 * node showing up in /dev that is in raw mode.
	 */
	rc = ndctl_namespace_enable(ndns);
	if (rc != 0)
		error("%s: failed to enable raw mode\n", devname);
	sprintf(path, "/dev/%s", ndctl_namespace_get_block_device(ndns));

And move the resetting of raw_mode down into the out: label.  That will
go a long way towards clearing up the confusion I had when reading the
code for the first time.  I think you should also mention that a
pre-condition for this function is that the namespace is disabled, and
you should check for it and provide a useful error message if it's
already enabled.

> Maybe this is a bit pointless? But we should try and keep things in the
> same state the user had, right? :)

I definitely agree we should keep things in the same state.  I really
didn't have a good grasp of what was going on there, and comments would
have saved me some time digging through both ndctl and kernel code.

Cheers,
Jeff
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
       [not found]             ` <x49vat5ljtz.fsf-RRHT56Q3PSP4kTEheFKJxxDDeQx5vsVwAInAS/Ez/D0@public.gmane.org>
@ 2017-01-24 20:59               ` Verma, Vishal L
  0 siblings, 0 replies; 11+ messages in thread
From: Verma, Vishal L @ 2017-01-24 20:59 UTC (permalink / raw)
  To: jmoyer-H+wXaHxf7aLQT0dZR+AlfA; +Cc: linux-nvdimm-y27Ovi1pjclAfugRpC6u6w

On Mon, 2017-01-23 at 13:20 -0500, Jeff Moyer wrote:
> 
> 
> OK, yeah, let's definitely put tests in the tree for anything that is
> easy to do.  Can't we create test nfits large enough for multiple
> arenas?  One thing we could do is keep around bogus metadata images
> and
> just shove them into the raw device.  Only if that's easier than
> programmatically perturbuing things.
> 
> Also, we may consider a btt metadata dump utility, much like file
> systems have.1

Yep I will work on tests. No, the ARENA_MAX_SIZE is 512G, so the kernel
won't create a second arena unless the namespace is larger than that,
and nfit_test namespaces are only 32M or so..

Dan and I were also talking about a a metadata dump utility similar to
filesystems. I'll work on this next.

> 
> > > Checking should not be an optional feature.  Just always build it
> > > in.
> > 
> > You mean checking metadata when bringing up the namespace? The
> > kernel
> > does essential checks of course, but do you mean the more involved
> > ones
> > like log/map entry bounds, the backup info block etc?
> 
> I mean't the "check" ndctl command shouldn't be selectable via
> configure.  I think Dan had the same comment.

Ah yes I'll make that change.

> 
> > > There seems to be a lot of repeated code sequences around reading
> > > and
> > > verifying the btt info blocks.  Perhaps that can be factored
> > > out?  Maybe
> > > get rid of the first_sb special snowflake and roll that into the
> > > arena
> > > discovery?
> 
> You didn't comment on this one... did you miss it, or will you
> investigate, or do you think it's a bogus comment?

I just missed it. I did notice the read+verify sequence, maybe in the
least, I could wrap those two into a wrapper function that combines the
two operations.

I think the stuff we do in first_sb is required to find (recover) the
info block if one is not readily found. It is a special snowflake just
because it uses some heuristics to try and find a usable info block,
where as discover_arenas just walks through the namespace, and knows
what to expect at each step. I'll stare at it harder and see if I can
consolidate/refactor anything :)

> 
> > > > diff --git a/ndctl/btt-structs.h b/ndctl/btt-structs.h
> > > > +#define IB_FLAG_ERROR 0x00000001
> > > > +#define IB_FLAG_ERROR_MASK 0x00000001
> > > 
> > > IB_FLAG_ERROR is defined but never used/checked.  Of course,
> > > neither
> > > the
> > > kernel nor this utility will set that flag.  We should probably
> > > decide
> > > whether it makes sense to do so.
> > 
> > True, will remove.
> 
> Well, hold on.  This flag exists in the spec, so at the very least we
> should consider where it should be set, and what to do if it /is/ set.
> Let's not just remove it.

I said that because we know currently in the kernel we have no way of
setting it. I think the first thing we'd need to do is figure out what
cases it will be set under.. And then figure out what the checker can do
about it.

> 
> > > I find it odd that the caller has to pass in the offset 4k before
> > > the
> > > start of the info block.  It seems natural for the first info
> > > block,
> > > but
> > > for the backup, it's totally bogus.  ;-)
> > 
> > I agree, and I hated it too :)
> > But I wanted to keep the +4K addition in the lowest level
> > functions..
> > Have a better idea? :)  This does kinda kill the idea of "oh the 4k
> > offsetting will be completely hidden from the higher level logic"
> > 
> > The opposite of this would be sprinkling a number of +4K everywhere
> > else, and I think with this we only have to do the inverse in a
> > couple
> > of places..
> 
> I guess I'd have to see the result of inverting the roles.  Feel free
> to
> hold off on this one, I don't think it's critical.
> 
> > > Again, z and e are unused.  Nuke 'em.
> > 
> > True, I just copied these over from the kernel implementation, and
> > wasn't sure if I'd ever be using z and e. Currently the kernel
> > doesn't
> > even set those, so it is a bit redundant. If the kernel does grow
> > the
> > ability to use them in future, there could be use for these, but I
> > suppose I should remove the dead code until that happens :)
> 
> Every single state of those two bits is valid.  How would you ever do
> something useful with them?

Agreed, I don't see a scenario where the checker would attempt to modify
the bits.
> 
> > > Should we perform a load of the info block after the msync to
> > > ensure
> > > it
> > > was written (to trigger an MCE if one is going to happen)?  That
> > > same
> > > question could be asked for each place where an update is made.
> > 
> > Good question. I guess the probability of those locations having a
> > bad
> > cell is low because we did just read them to find out there is a
> > problem, and subsequently writing them should have a low probability
> > of
> > triggering a CMCI. But re-reading to make sure it was correctly
> > written
> > is not a bad idea. Dan, thoughts?
> 
> This one's not critical.  But it's a checker tool, so it might be nice
> to be paranoid.
> 
> > > This isn't an error.  We expect this to happen if power is
> > > suddenly
> > > lost, and the format is built to recover from it.  If you log it
> > > at
> > > all, I would make it an informational message.
> > > 
> > > I wonder whether we should even fix it.  I worry about having the
> > > same
> > > recovery code in multiple places (the kernel and here).  What do
> > > you
> > > think?
> > 
> > Hm, I agree about the message, in that it is routine. But I added
> > the
> > repairing so that a user may just decide to do a check using this
> > after
> > a power failure, and we can easily fix it when we have the chance. I
> > do
> > agree about the check/correction in multiple places, I wouldn't be
> > too
> > opposed to removing it entirely from here.
> 
> If you remove it, put a big comment in there.  :)  I'll leave the
> ultimate decision up to you.  It's not complex recovery, so we
> *should*
> be okay to have that logic in multiple places...

I can remove it, this makes the entire function (btt_check_log_map)
disappear, and I'll put a comment in the main checker loop saying that
this is a normal condition.

> 
> > > > +	/*
> > > > +	 * Attempt 3: use info2off as-is, and check if we find
> > > > a
> > > > valid info
> > > > +	 * block at that location.
> > > > +	 */
> > > > +	 offset = le32toh(btt_sb[0].info2off);
> > > > +	 if (offset) {
> > > 
> > > Maybe do some bounds checking on offset?
> > 
> > If the offset is bogus either the read will fail or the verify will,
> > but
> > yes I could add checking too.
> 
> Again, not critical.  I'll leave it up to you.
> 
> > > > +		printf("Attempting to recover info-block from
> > > > info2
> > > > offset (as-is) %#llx\n", offset);
> > > > +		rc = btt_read_info(bttc, &btt_sb[1], offset);
> > > > +		if (rc)
> > > > +			goto out;
> > > > +		rc = btt_info_verify(bttc, &btt_sb[1]);
> > > > +		if (rc == 0) {
> > > > +			rc = btt_write_info(bttc, &btt_sb[1],
> > > > 0);
> > > > +			goto out;
> > > > +		}
> > > > +	} else
> > > > +		rc = -ENXIO;
> > > 
> > > You could also look for the primary info blocks for the other
> > > arenas.
> > 
> > That seems kind of difficult as we don't really have any information
> > on
> > where the other arenas might be.
> > We could do a full calculation of "this is where I would lay out the
> > arenas if I had this rawsize", but I was hoping to avoid doing
> > that..
> 
> OK.  If we get to here, things are pretty well screwed.  Feel free to
> ignore that suggestion.
> 
> > > > + out:
> > > > +	free(btt_sb);
> > > > +	return rc;
> > > > +}
> > > > +
> > > > +static int namespace_check(struct ndctl_region *region,
> > > > +		struct ndctl_namespace *ndns)
> > > > +{
> > > 
> > > namespace_check sounds generic, but it really only works for the
> > > btt.
> > > Also, should we verify that the namespace was configured with a
> > > btt
> > > before even attempting to check it?
> > 
> > True. I did think of checking if this is a btt namespace before
> > checking, but I think if the namespace is disabled, my understanding
> > is
> > ndctl and libndctl have no idea whether it is a btt namespace or
> > not.
> > The BTT is discovered by the kernel when doing it's probe, and once
> > it
> > is up, ndctl can see that, but until then, it can really only find
> > out
> > if there is a valid BTT by trying to read the metadata itself.
> 
> Yeah.  At some point I guess we have to trust the admin not to try to
> fix the btt on a non-btt device.  However, if we allowed them to run
> the
> command on a pmemNs device, that would reduce the opportunities for
> failure.

I'm liking the --force option for this (running on a 'live' namespace),
and even if someone does run it on a non-BTT namespace, we fail quickly
enough with a helpful message..

> 
> > > > +	raw_mode = ndctl_namespace_get_raw_mode(ndns);
> > > > +	ndctl_namespace_set_raw_mode(ndns, 1);
> > > > +	rc = ndctl_namespace_enable(ndns);
> > > > +	if (rc != 0)
> > > > +		error("%s: failed to enable raw mode\n",
> > > > devname);
> > > > +	ndctl_namespace_set_raw_mode(ndns, raw_mode);
> > > 
> > > What the heck do the above 6 lines do?  Please explain.
> > 
> > Linda had the same question so I suppose I should at least have a
> > comment there :)
> > But the idea is we get the existing mode (raw or not), and save it
> > in
> > raw_mode. Then try to enable it with raw_mode = 1. If it fails,
> > restore
> > the saved raw_mode.
> 
> I had more basic questions, like what the heck is going on?  :-) I've
> done some more reading, and this is what I think is happening.
> 
> 	/* In typical usage, the current raw_mode should be false. */
> 	raw_mode = ndctl_namespace_get_raw_mode(ndns);
> 	/*
> 	 * Putting the namespace into raw mode will allow us to access
> 	 * the btt metadata.
> 	 */
> 	ndctl_namespace_set_raw_mode(ndns, 1);
> 	/*
> 	 * Now enable the namespace.  This will result in a pmem device
> 	 * node showing up in /dev that is in raw mode.
> 	 */
> 	rc = ndctl_namespace_enable(ndns);
> 	if (rc != 0)
> 		error("%s: failed to enable raw mode\n", devname);
> 	sprintf(path, "/dev/%s",
> ndctl_namespace_get_block_device(ndns));
> 
> And move the resetting of raw_mode down into the out: label.  That
> will
> go a long way towards clearing up the confusion I had when reading the
> code for the first time.  I think you should also mention that a
> pre-condition for this function is that the namespace is disabled, and
> you should check for it and provide a useful error message if it's
> already enabled.

Yes your interpretation is correct, and I will steal some of the above
comments and put them in :)

> 
> > Maybe this is a bit pointless? But we should try and keep things in
> > the
> > same state the user had, right? :)
> 
> I definitely agree we should keep things in the same state.  I really
> didn't have a good grasp of what was going on there, and comments
> would
> have saved me some time digging through both ndctl and kernel code.


> 
> Cheers,
> Jeff
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
  2017-01-20 21:00 ` Linda Knippers
@ 2017-01-24 22:58   ` Verma, Vishal L
  0 siblings, 0 replies; 11+ messages in thread
From: Verma, Vishal L @ 2017-01-24 22:58 UTC (permalink / raw)
  To: linda.knippers, linux-nvdimm

On Fri, 2017-01-20 at 16:00 -0500, Linda Knippers wrote:
> Hi Vishal,
> 
> It's nice to see a BTT check utility.
> 
> I have a few high level questions/comments below.

Hi Linda - thanks! See below -

> 
> -- ljk
> 
> On 01/19/2017 10:12 PM, Vishal Verma wrote:
> > 
> > +
> > +EXAMPLES
> > +--------
> > +
> > +Check a BTT namespace
> > +[verse]
> > +ndctl disable-namespace namespace0.0
> > +ndctl check-namespace namespace0.0
> 
> If the namespace has to be disabled to be checked, it's probably worth
> saying in
> the text as well as in the example.

Yes, updating.

> 
> > +
> > +Check a BTT namespace, but don't actually restore/correct anything
> > +[verse]
> > +ndctl check-namespace --dry-run
> 
> Does the namespace not have to be disabled for the --dry-run option?

It does, and I'm reversing the logic, converting --dry-run to --repair
according to Dan's suggestion, but I'll include the disable-namespace in
both examples.

> 
> 
> If we only know how to check namespaces that are in sector mode,
> should you check up front and then bail out with an appropriate
> message
> if the namespace isn't in sector mode?  Otherwise, it just seems
> like we fail later with a message that isn't very helpful.

Like I said in a reply to Jeff, it isn't really possible for ndctl to
tell if it was supposed to be in sector mode, only the kernel can do
that during the probe. I think the best we can do is notify that a BTT
was not found on the namespace at all (which is I believe what we do
here).

> 
> > +
> > +	printf("checking %s\n", devname);
> > +	if (!is_namespace_offline(ndns)) {
> > +		error("%s: check aborted, namespace online\n",
> > devname);
> > +		return -EBUSY;
> > +	}
> > +
> > +	raw_mode = ndctl_namespace_get_raw_mode(ndns);
> > +	ndctl_namespace_set_raw_mode(ndns, 1);
> > +	rc = ndctl_namespace_enable(ndns);
> > +	if (rc != 0)
> > +		error("%s: failed to enable raw mode\n", devname);
> 
> The namespace is now enabled in raw mode.  How do we prevent something
> else from seeing the device and opening it?  You open it later but not
> in exclusive mode, and it gets opened and closed a few times leaving
> windows where something else could open it.  Or is something else
> protecting the device?

Good point, like Dan also suggested, I'll change this to doing one
O_EXCL open at the start and using that fd for all subsequent
operations.

> 
> > +	ndctl_namespace_set_raw_mode(ndns, raw_mode);
> 
> What is this second call doing?

See the conversation with Jeff regarding these - we're just saving the
raw_mode, setting it to '1', and restoring raw_mode on failure. Like
Jeff suggested, I'll move the restore to the 'out' label.

> 
> > +	sprintf(path, "/dev/%s",
> > ndctl_namespace_get_block_device(ndns));
> > +
> > +	btt_sb = malloc(sizeof(*btt_sb));
> > +	if (btt_sb == NULL) {
> > +		rc = -ENOMEM;
> > +		goto out;
> > +	}
> > +
> > +	bttc = calloc(1, sizeof(*bttc));
> > +	if (bttc == NULL) {
> > +		rc = -ENOMEM;
> > +		goto out_sb;
> > +	}
> > +	bttc->path = path;
> > +	bttc->rawsize = ndctl_namespace_get_size(ndns);
> > +	ndctl_namespace_get_uuid(ndns, bttc->parent_uuid);
> > +
> > +	rc = btt_read_info(bttc, btt_sb, 0);
> > +	if (rc)
> > +		goto out_bttc;
> > +	rc = btt_info_verify(bttc, btt_sb);
> > +	if (rc) {
> > +		rc = btt_recover_first_sb(bttc);
> > +		if (rc) {
> > +			error("Unable to recover any BTT info
> > blocks, aborting\n");
> > +			goto out_bttc;
> > +		}
> > +		rc = btt_read_info(bttc, btt_sb, 0);
> > +		if (rc)
> > +			goto out_bttc;
> > +	}
> > +	rc = btt_discover_arenas(bttc);
> > +	if (rc)
> > +		goto out_bttc;
> > +
> > +	rc = btt_create_mappings(bttc);
> > +	if (rc)
> > +		goto out_bttc;
> > +
> > +	rc = btt_check_arenas(bttc);
> > +
> > + out_bttc:
> > +	btt_remove_mappings(bttc);
> > +	free(bttc);
> > + out_sb:
> > +	free(btt_sb);
> > + out:
> > +	ndctl_namespace_disable(ndns);
> 
> Since you put the namespace into raw mode, do you have to put it back
> into sector mode?

Yep will fix.
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] ndctl: add a BTT check utility
       [not found]     ` <x49lgu52uk0.fsf-RRHT56Q3PSP4kTEheFKJxxDDeQx5vsVwAInAS/Ez/D0@public.gmane.org>
  2017-01-21  0:41       ` Verma, Vishal L
@ 2017-01-31  0:12       ` Verma, Vishal L
  1 sibling, 0 replies; 11+ messages in thread
From: Verma, Vishal L @ 2017-01-31  0:12 UTC (permalink / raw)
  To: jmoyer-H+wXaHxf7aLQT0dZR+AlfA; +Cc: linux-nvdimm-y27Ovi1pjclAfugRpC6u6w

On Fri, 2017-01-20 at 18:14 -0500, Jeff Moyer wrote:

<snip>
> 
> > +static int btt_map_read(struct arena_info *a, __u32 lba, __u32
> > *mapping,
> > +			int *trim, int *error)
> > +{
> > +	__u32 raw_mapping, postmap, ze, z_flag, e_flag;
> > +
> > +	raw_mapping = le32toh(a->map.map[lba]);
> > +	z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
> > +	e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
> > +	ze = (z_flag << 1) + e_flag;
> > +	postmap = raw_mapping & MAP_LBA_MASK;
> > +
> > +	/* Reuse the {z,e}_flag variables for *trim and *error */
> > +	z_flag = 0;
> > +	e_flag = 0;
> > +
> > +	switch (ze) {
> > +	case 0:
> > +		/* Initial state. Return postmap = premap */
> > +		*mapping = lba;
> > +		break;
> > +	case 1:
> > +		*mapping = postmap;
> > +		e_flag = 1;
> > +		break;
> > +	case 2:
> > +		*mapping = postmap;
> > +		z_flag = 1;
> > +		break;
> > +	case 3:
> > +		*mapping = postmap;
> > +		break;
> > +	default:
> > +		return -EIO;
> > +	}
> > +
> > +	if (trim)
> > +		*trim = z_flag;
> > +	if (error)
> > +		*error = e_flag;
> > +
> > +	return 0;
> > +}
> 
> No caller uses z or e, so get rid of those args.  Also, your default
> entry in the switch statement can never be hit.  Fixing those, the
> function is just:
> 
> /*
>  * btt_map_lookup
>  *
>  * Given a pre-map Arena Block Address, return the post-map ABA.
>  */
> static u32 btt_map_lookup(struct arena_info *a, __u32 lba)
> {
>         return (le32toh(a->map.map[lba]) & MAP_LBA_MASK);

I think this might be an oversimplification?
We still have to check whether the z and e flags are both 0, and if so
return 'lba' -- the initial state check. This would return '0' which
would be incorrect.. I will simplify the arguments etc though, and not
return the values of flags.

> }
> 
> You can take or leave the function name change.  Please keep the
> comment, and maybe kernel-doc-ify it.
> 
> > +static int btt_map_write(struct arena_info *a, __u32 lba, __u32
> > mapping,
> > +			__u32 z_flag, __u32 e_flag)
> > +{
> 
> Again, z and e are unused.  Nuke 'em.
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2017-01-31  0:12 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-20  3:12 [PATCH] ndctl: add a BTT check utility Vishal Verma
2017-01-20 20:04 ` Dan Williams
2017-01-23 17:42   ` Ross Zwisler
2017-01-23 17:49     ` Dan Williams
2017-01-20 21:00 ` Linda Knippers
2017-01-24 22:58   ` Verma, Vishal L
     [not found] ` <20170120031234.24226-1-vishal.l.verma-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2017-01-20 23:14   ` Jeff Moyer
     [not found]     ` <x49lgu52uk0.fsf-RRHT56Q3PSP4kTEheFKJxxDDeQx5vsVwAInAS/Ez/D0@public.gmane.org>
2017-01-21  0:41       ` Verma, Vishal L
     [not found]         ` <1484959224.4857.6.camel-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2017-01-23 18:20           ` Jeff Moyer
     [not found]             ` <x49vat5ljtz.fsf-RRHT56Q3PSP4kTEheFKJxxDDeQx5vsVwAInAS/Ez/D0@public.gmane.org>
2017-01-24 20:59               ` Verma, Vishal L
2017-01-31  0:12       ` Verma, Vishal L

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.