* [PATCH 0/5] [RFC] RAID-level terminology change
@ 2013-03-09 20:31 Hugo Mills
  2013-03-09 20:31 ` [PATCH 1/5] Use nCmSpP format for mkfs Hugo Mills
                   ` (10 more replies)
  0 siblings, 11 replies; 37+ messages in thread
From: Hugo Mills @ 2013-03-09 20:31 UTC (permalink / raw)
  To: linux-btrfs

   Some time ago, and occasionally since, we've discussed altering the
"RAID-n" terminology to change it to an "nCmSpP" format, where n is the
number of copies, m is the number of (data) devices in a stripe per copy,
and p is the number of parity devices in a stripe.

   The current kernel implementation uses as many devices as it can in the
striped modes (RAID-0, -10, -5, -6), and in this implementation, that is
written as "mS" (with a literal "m"). The mS and pP sections are omitted
if the value is 1S or 0P.

   The magic look-up table for old-style / new-style is:

single    1C (or omitted, in btrfs fi df output)
RAID-0    1CmS
RAID-1    2C
DUP       2CD
RAID-10   2CmS
RAID-5    1CmS1P
RAID-6    1CmS2P

   The following patch set modifies userspace tools to accept C/S/P formats
as input (mkfs and the restriper). The older formats are also accepted. It
also prints the newer formats by default in btrfs fi df, with an option to
show the older format for the traditionalists.
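
   As a sketch of the intended usage (illustrative only; the device names
and mount point below are made up), with this series applied the same
profile string can be given to mkfs and to the restriper, for example:

   mkfs.btrfs -m 2C -d 1CmS1P /dev/sdb /dev/sdc /dev/sdd
   btrfs balance start -dconvert=1CmS1P -mconvert=2C /mnt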

   I'm not sure whether we should omit the 1C in btrfs fi df output, or
make it explicit even in the "single" case.

   Hugo.

Hugo Mills (5):
  Use nCmSpP format for mkfs
  Move parse_profile to utils.c
  Convert balance filter parser to use common nCmSpP replication-level
    parser
  Change output of btrfs fi df to report new (or old) RAID names
  Add man page description for nCmSpP replication levels

 cmds-balance.c      |   23 +++------
 cmds-filesystem.c   |  135 +++++++++++++++++++++++++++++++++++++++++++--------
 man/btrfs.8.in      |    9 ++++
 man/mkfs.btrfs.8.in |   24 ++++++++-
 mkfs.c              |   35 ++++---------
 utils.c             |   94 +++++++++++++++++++++++++++++++++++
 utils.h             |    1 +
 7 files changed, 258 insertions(+), 63 deletions(-)

-- 
1.7.10.4



* [PATCH 1/5] Use nCmSpP format for mkfs
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
@ 2013-03-09 20:31 ` Hugo Mills
  2013-03-09 20:31 ` [PATCH 2/5] Move parse_profile to utils.c Hugo Mills
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 37+ messages in thread
From: Hugo Mills @ 2013-03-09 20:31 UTC (permalink / raw)
  To: linux-btrfs

Teach mkfs.btrfs about the nCmSpP format for replication levels, which
avoids the semantic uncertainty of the "RAID-XYZ" naming.

Signed-off-by: Hugo Mills <hugo@carfax.org.uk>
---
 mkfs.c |   91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 84 insertions(+), 7 deletions(-)

diff --git a/mkfs.c b/mkfs.c
index 5ece186..02a7086 100644
--- a/mkfs.c
+++ b/mkfs.c
@@ -326,7 +326,9 @@ static void print_usage(void)
 	fprintf(stderr, "options:\n");
 	fprintf(stderr, "\t -A --alloc-start the offset to start the FS\n");
 	fprintf(stderr, "\t -b --byte-count total number of bytes in the FS\n");
-	fprintf(stderr, "\t -d --data data profile, raid0, raid1, raid10, dup or single\n");
+	fprintf(stderr, "\t -d --data data profile: <n>C[<m>S[<p>P]]\n");
+	fprintf(stderr, "\t\tfor n copies, m stripes, p parity stripes\n");
+	fprintf(stderr, "\t\tor raid0, raid1, raid10, dup or single (deprecated)\n");
 	fprintf(stderr, "\t -l --leafsize size of btree leaves\n");
 	fprintf(stderr, "\t -L --label set a label\n");
 	fprintf(stderr, "\t -m --metadata metadata profile, values like data profile\n");
@@ -346,8 +348,36 @@ static void print_version(void)
 	exit(0);
 }
 
-static u64 parse_profile(char *s)
+static u64 make_profile(int copies, int dup, int stripes, int parity)
 {
+	if(copies == 1 && !dup && stripes == 0 && parity == 0)
+		return 0;
+	else if(copies == 2 && dup && stripes == 0 && parity == 0)
+		return BTRFS_BLOCK_GROUP_DUP;
+	else if(copies == 2 && !dup && stripes == 0 && parity == 0)
+		return BTRFS_BLOCK_GROUP_RAID1;
+	else if(copies == 2 && !dup && stripes == -1 && parity == 0)
+		return BTRFS_BLOCK_GROUP_RAID10;
+	else if(copies == 1 && !dup && stripes == -1 && parity == 0)
+		return BTRFS_BLOCK_GROUP_RAID0;
+	else if(copies == 1 && !dup && stripes == -1 && parity == 1)
+		return BTRFS_BLOCK_GROUP_RAID5;
+	else if(copies == 1 && !dup && stripes == -1 && parity == 2)
+		return BTRFS_BLOCK_GROUP_RAID6;
+
+	return (u64)-1;
+}
+
+static u64 parse_profile(const char *s)
+{
+	char *pos, *parse_end;
+	int copies = 1;
+	int stripes = 0;
+	int parity = 0;
+	int dup = 0;
+	u64 profile = (u64)-1;
+
+	/* Look for exact match with historical forms first */
 	if (strcmp(s, "raid0") == 0) {
 		return BTRFS_BLOCK_GROUP_RAID0;
 	} else if (strcmp(s, "raid1") == 0) {
@@ -362,12 +392,54 @@ static u64 parse_profile(char *s)
 		return BTRFS_BLOCK_GROUP_DUP;
 	} else if (strcmp(s, "single") == 0) {
 		return 0;
+	}
+
+	/* Attempt to parse new nCmSpP form */
+	/* nC is required and n must be an unsigned decimal number */
+	copies = strtoul(s, &parse_end, 10);
+	if(parse_end == s || (*parse_end != 'c' && *parse_end != 'C'))
+		goto unknown;
+
+	/* C may be followed by D to indicate non-redundant/DUP */
+	pos = parse_end + 1;
+	if(*pos == 'D' || *pos == 'd') {
+		dup = 1;
+		pos++;
+	}
+	if(*pos == 0)
+		goto done;
+
+	/* mS is optional, and m may be an integer, or a literal "m" */
+	if(*pos == 'm') {
+		stripes = -1;
+		parse_end = pos+1;
 	} else {
-		fprintf(stderr, "Unknown profile %s\n", s);
-		print_usage();
+		stripes = strtoul(pos, &parse_end, 10);
 	}
-	/* not reached */
-	return 0;
+	if(parse_end == pos || (*parse_end != 's' && *parse_end != 'S'))
+		goto unknown;
+
+	pos = parse_end + 1;
+	if(*pos == 0)
+		goto done;
+
+	/* pP is optional, and p must be an integer */
+	parity = strtoul(pos, &parse_end, 10);
+	if(parse_end == pos || (*parse_end != 'p' && *parse_end != 'P'))
+		goto unknown;
+	pos = parse_end + 1;
+	if(*pos != 0)
+		goto unknown;
+
+done:
+	profile = make_profile(copies, dup, stripes, parity);
+	if(profile == (u64)-1)
+		fprintf(stderr, "Unknown or unavailable profile '%s'\n", s);
+	return profile;
+
+unknown:
+	fprintf(stderr, "Unparseable profile '%s'\n", s);
+	return (u64)-1;
 }
 
 static char *parse_label(char *input)
@@ -1447,6 +1519,11 @@ int main(int ac, char **av)
 	printf("\nWARNING! - %s IS EXPERIMENTAL\n", BTRFS_BUILD_VERSION);
 	printf("WARNING! - see http://btrfs.wiki.kernel.org before using\n\n");
 
+	if (data_profile == (u64)-1 || metadata_profile == (u64)-1) {
+		fprintf(stderr, "Cannot handle requested replication profile. Aborting\n");
+		exit(1);
+	}
+
 	if (source_dir == 0) {
 		file = av[optind++];
 		ret = is_swap_device(file);
@@ -1666,7 +1743,7 @@ raid_groups:
 
 		flags |= BTRFS_FEATURE_INCOMPAT_RAID56;
 		btrfs_set_super_incompat_flags(super, flags);
-		printf("Setting RAID5/6 feature flag\n");
+		printf("Setting parity-RAID feature flag\n");
 	}
 
 	printf("fs created label %s on %s\n\tnodesize %u leafsize %u "
-- 
1.7.10.4
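
For illustration (not part of the patch), tracing parse_profile() above on a
few inputs:

  "1CmS2P"  copies=1, literal "m" gives stripes=-1, parity=2
            -> make_profile() returns BTRFS_BLOCK_GROUP_RAID6
  "2CD"     copies=2, dup=1, string ends after the D
            -> make_profile() returns BTRFS_BLOCK_GROUP_DUP
  "3C"      parses cleanly, but make_profile() has no mapping for it, so
            "Unknown or unavailable profile '3C'" is printed, (u64)-1 is
            returned, and mkfs later aborts with "Cannot handle requested
            replication profile."

Fixed stripe counts such as "1C4S" also parse, but are rejected the same
way, since make_profile() currently only knows the as-many-stripes-as-
possible (stripes == -1) cases.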



* [PATCH 2/5] Move parse_profile to utils.c
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
  2013-03-09 20:31 ` [PATCH 1/5] Use nCmSpP format for mkfs Hugo Mills
@ 2013-03-09 20:31 ` Hugo Mills
  2013-03-09 20:31 ` [PATCH 3/5] Convert balance filter parser to use common nCmSpP replication-level parser Hugo Mills
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 37+ messages in thread
From: Hugo Mills @ 2013-03-09 20:31 UTC (permalink / raw)
  To: linux-btrfs

Make parse_profile a shared function so it can be used across the
code-base.

Signed-off-by: Hugo Mills <hugo@carfax.org.uk>

Conflicts:
	mkfs.c
---
 mkfs.c  |   94 ---------------------------------------------------------------
 utils.c |   94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 utils.h |    1 +
 3 files changed, 95 insertions(+), 94 deletions(-)

diff --git a/mkfs.c b/mkfs.c
index 02a7086..51e545f 100644
--- a/mkfs.c
+++ b/mkfs.c
@@ -348,100 +348,6 @@ static void print_version(void)
 	exit(0);
 }
 
-static u64 make_profile(int copies, int dup, int stripes, int parity)
-{
-	if(copies == 1 && !dup && stripes == 0 && parity == 0)
-		return 0;
-	else if(copies == 2 && dup && stripes == 0 && parity == 0)
-		return BTRFS_BLOCK_GROUP_DUP;
-	else if(copies == 2 && !dup && stripes == 0 && parity == 0)
-		return BTRFS_BLOCK_GROUP_RAID1;
-	else if(copies == 2 && !dup && stripes == -1 && parity == 0)
-		return BTRFS_BLOCK_GROUP_RAID10;
-	else if(copies == 1 && !dup && stripes == -1 && parity == 0)
-		return BTRFS_BLOCK_GROUP_RAID0;
-	else if(copies == 1 && !dup && stripes == -1 && parity == 1)
-		return BTRFS_BLOCK_GROUP_RAID5;
-	else if(copies == 1 && !dup && stripes == -1 && parity == 2)
-		return BTRFS_BLOCK_GROUP_RAID6;
-
-	return (u64)-1;
-}
-
-static u64 parse_profile(const char *s)
-{
-	char *pos, *parse_end;
-	int copies = 1;
-	int stripes = 0;
-	int parity = 0;
-	int dup = 0;
-	u64 profile = (u64)-1;
-
-	/* Look for exact match with historical forms first */
-	if (strcmp(s, "raid0") == 0) {
-		return BTRFS_BLOCK_GROUP_RAID0;
-	} else if (strcmp(s, "raid1") == 0) {
-		return BTRFS_BLOCK_GROUP_RAID1;
-	} else if (strcmp(s, "raid5") == 0) {
-		return BTRFS_BLOCK_GROUP_RAID5;
-	} else if (strcmp(s, "raid6") == 0) {
-		return BTRFS_BLOCK_GROUP_RAID6;
-	} else if (strcmp(s, "raid10") == 0) {
-		return BTRFS_BLOCK_GROUP_RAID10;
-	} else if (strcmp(s, "dup") == 0) {
-		return BTRFS_BLOCK_GROUP_DUP;
-	} else if (strcmp(s, "single") == 0) {
-		return 0;
-	}
-
-	/* Attempt to parse new nCmSpP form */
-	/* nC is required and n must be an unsigned decimal number */
-	copies = strtoul(s, &parse_end, 10);
-	if(parse_end == s || (*parse_end != 'c' && *parse_end != 'C'))
-		goto unknown;
-
-	/* C may be followed by D to indicate non-redundant/DUP */
-	pos = parse_end + 1;
-	if(*pos == 'D' || *pos == 'd') {
-		dup = 1;
-		pos++;
-	}
-	if(*pos == 0)
-		goto done;
-
-	/* mS is optional, and m may be an integer, or a literal "m" */
-	if(*pos == 'm') {
-		stripes = -1;
-		parse_end = pos+1;
-	} else {
-		stripes = strtoul(pos, &parse_end, 10);
-	}
-	if(parse_end == pos || (*parse_end != 's' && *parse_end != 'S'))
-		goto unknown;
-
-	pos = parse_end + 1;
-	if(*pos == 0)
-		goto done;
-
-	/* pP is optional, and p must be an integer */
-	parity = strtoul(pos, &parse_end, 10);
-	if(parse_end == pos || (*parse_end != 'p' && *parse_end != 'P'))
-		goto unknown;
-	pos = parse_end + 1;
-	if(*pos != 0)
-		goto unknown;
-
-done:
-	profile = make_profile(copies, dup, stripes, parity);
-	if(profile == (u64)-1)
-		fprintf(stderr, "Unknown or unavailable profile '%s'\n", s);
-	return profile;
-
-unknown:
-	fprintf(stderr, "Unparseable profile '%s'\n", s);
-	return (u64)-1;
-}
-
 static char *parse_label(char *input)
 {
 	int len = strlen(input);
diff --git a/utils.c b/utils.c
index 1813dda..a6fb246 100644
--- a/utils.c
+++ b/utils.c
@@ -1420,6 +1420,100 @@ u64 parse_size(char *s)
 	return strtoull(s, NULL, 10) * mult;
 }
 
+static u64 make_profile(int copies, int dup, int stripes, int parity)
+{
+	if(copies == 1 && !dup && stripes == 0 && parity == 0)
+		return 0;
+	else if(copies == 2 && dup && stripes == 0 && parity == 0)
+		return BTRFS_BLOCK_GROUP_DUP;
+	else if(copies == 2 && !dup && stripes == 0 && parity == 0)
+		return BTRFS_BLOCK_GROUP_RAID1;
+	else if(copies == 2 && !dup && stripes == -1 && parity == 0)
+		return BTRFS_BLOCK_GROUP_RAID10;
+	else if(copies == 1 && !dup && stripes == -1 && parity == 0)
+		return BTRFS_BLOCK_GROUP_RAID0;
+	else if(copies == 1 && !dup && stripes == -1 && parity == 1)
+		return BTRFS_BLOCK_GROUP_RAID5;
+	else if(copies == 1 && !dup && stripes == -1 && parity == 2)
+		return BTRFS_BLOCK_GROUP_RAID6;
+
+	return (u64)-1;
+}
+
+u64 parse_profile(const char *s)
+{
+	char *pos, *parse_end;
+	int copies = 1;
+	int stripes = 0;
+	int parity = 0;
+	int dup = 0;
+	u64 profile = (u64)-1;
+
+	/* Look for exact match with historical forms first */
+	if (strcmp(s, "raid0") == 0) {
+		return BTRFS_BLOCK_GROUP_RAID0;
+	} else if (strcmp(s, "raid1") == 0) {
+		return BTRFS_BLOCK_GROUP_RAID1;
+	} else if (strcmp(s, "raid5") == 0) {
+		return BTRFS_BLOCK_GROUP_RAID5;
+	} else if (strcmp(s, "raid6") == 0) {
+		return BTRFS_BLOCK_GROUP_RAID6;
+	} else if (strcmp(s, "raid10") == 0) {
+		return BTRFS_BLOCK_GROUP_RAID10;
+	} else if (strcmp(s, "dup") == 0) {
+		return BTRFS_BLOCK_GROUP_DUP;
+	} else if (strcmp(s, "single") == 0) {
+		return 0;
+	}
+
+	/* Attempt to parse new nCmSpP form */
+	/* nC is required and n must be an unsigned decimal number */
+	copies = strtoul(s, &parse_end, 10);
+	if(parse_end == s || (*parse_end != 'c' && *parse_end != 'C'))
+		goto unknown;
+
+	/* C may be followed by D to indicate non-redundant/DUP */
+	pos = parse_end + 1;
+	if(*pos == 'D' || *pos == 'd') {
+		dup = 1;
+		pos++;
+	}
+	if(*pos == 0)
+		goto done;
+
+	/* mS is optional, and m may be an integer, or a literal "m" */
+	if(*pos == 'm') {
+		stripes = -1;
+		parse_end = pos+1;
+	} else {
+		stripes = strtoul(pos, &parse_end, 10);
+	}
+	if(parse_end == pos || (*parse_end != 's' && *parse_end != 'S'))
+		goto unknown;
+
+	pos = parse_end + 1;
+	if(*pos == 0)
+		goto done;
+
+	/* pP is optional, and p must be an integer */
+	parity = strtoul(pos, &parse_end, 10);
+	if(parse_end == pos || (*parse_end != 'p' && *parse_end != 'P'))
+		goto unknown;
+	pos = parse_end + 1;
+	if(*pos != 0)
+		goto unknown;
+
+done:
+	profile = make_profile(copies, dup, stripes, parity);
+	if(profile == (u64)-1)
+		fprintf(stderr, "Unknown or unavailable profile '%s'\n", s);
+	return profile;
+
+unknown:
+	fprintf(stderr, "Unparseable profile '%s'\n", s);
+	return (u64)-1;
+}
+
 int open_file_or_dir(const char *fname)
 {
 	int ret;
diff --git a/utils.h b/utils.h
index 0b681ed..dcaaa7f 100644
--- a/utils.h
+++ b/utils.h
@@ -47,6 +47,7 @@ char *pretty_sizes(u64 size);
 int get_mountpt(char *dev, char *mntpt, size_t size);
 int btrfs_scan_block_devices(int run_ioctl);
 u64 parse_size(char *s);
+u64 parse_profile(const char* s);
 int open_file_or_dir(const char *fname);
 int get_device_info(int fd, u64 devid,
 		    struct btrfs_ioctl_dev_info_args *di_args);
-- 
1.7.10.4
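
As a quick sanity check (not part of the patch; the file name here is made
up), the newly-exported function can be driven from a throwaway harness
compiled inside the btrfs-progs tree:

	/* test-profile.c: throwaway harness for parse_profile() */
	#include <stdio.h>
	#include "kerncompat.h"		/* u64 typedef used by btrfs-progs */
	#include "utils.h"		/* u64 parse_profile(const char *s); */

	int main(void)
	{
		const char *tests[] = { "single", "2C", "2CD", "1CmS",
					"2CmS", "1CmS1P", "1CmS2P", "3C" };
		unsigned int i;

		for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
			/* 0 means single; (u64)-1 means unknown/unsupported */
			printf("%-8s -> 0x%llx\n", tests[i],
			       (unsigned long long)parse_profile(tests[i]));
		}
		return 0;
	}

(The exact compile/link line depends on the btrfs-progs build; utils.c
drags in a few other objects.)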



* [PATCH 3/5] Convert balance filter parser to use common nCmSpP replication-level parser
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
  2013-03-09 20:31 ` [PATCH 1/5] Use nCmSpP format for mkfs Hugo Mills
  2013-03-09 20:31 ` [PATCH 2/5] Move parse_profile to utils.c Hugo Mills
@ 2013-03-09 20:31 ` Hugo Mills
  2013-03-09 20:31 ` [PATCH 4/5] Change output of btrfs fi df to report new (or old) RAID names Hugo Mills
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 37+ messages in thread
From: Hugo Mills @ 2013-03-09 20:31 UTC (permalink / raw)
  To: linux-btrfs

Balance filters are the second location which takes user input of
replication levels. Update this to use the common parser so that we can
provide nCmSpP-style names.

Signed-off-by: Hugo Mills <hugo@carfax.org.uk>
---
 cmds-balance.c |   23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/cmds-balance.c b/cmds-balance.c
index f5dc317..6186963 100644
--- a/cmds-balance.c
+++ b/cmds-balance.c
@@ -42,23 +42,16 @@ static const char balance_cmd_group_info[] =
 
 static int parse_one_profile(const char *profile, u64 *flags)
 {
-	if (!strcmp(profile, "raid0")) {
-		*flags |= BTRFS_BLOCK_GROUP_RAID0;
-	} else if (!strcmp(profile, "raid1")) {
-		*flags |= BTRFS_BLOCK_GROUP_RAID1;
-	} else if (!strcmp(profile, "raid10")) {
-		*flags |= BTRFS_BLOCK_GROUP_RAID10;
-	} else if (!strcmp(profile, "raid5")) {
-		*flags |= BTRFS_BLOCK_GROUP_RAID5;
-	} else if (!strcmp(profile, "raid6")) {
-		*flags |= BTRFS_BLOCK_GROUP_RAID6;
-	} else if (!strcmp(profile, "dup")) {
-		*flags |= BTRFS_BLOCK_GROUP_DUP;
-	} else if (!strcmp(profile, "single")) {
-		*flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-	} else {
+	u64 result;
+
+	result = parse_profile(profile);
+	if (result == (u64)-1) {
 		fprintf(stderr, "Unknown profile '%s'\n", profile);
 		return 1;
+	} else if (result == 0) {
+		*flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+	} else {
+		*flags |= result;
 	}
 
 	return 0;
-- 
1.7.10.4
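
For example (illustrative; the mount point is made up), the restriper's
convert filters then take either spelling:

	btrfs balance start -dconvert=1CmS /mnt
	btrfs balance start -dconvert=raid0 /mnt

Both end up setting BTRFS_BLOCK_GROUP_RAID0 in the filter flags. A profile
of "single" (or "1C") is the special case above: parse_profile() returns 0
for it, and parse_one_profile() translates that to
BTRFS_AVAIL_ALLOC_BIT_SINGLE, which is what the balance ioctl uses for the
single profile.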



* [PATCH 4/5] Change output of btrfs fi df to report new (or old) RAID names
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
                   ` (2 preceding siblings ...)
  2013-03-09 20:31 ` [PATCH 3/5] Convert balance filter parser to use common nCmSpP replication-level parser Hugo Mills
@ 2013-03-09 20:31 ` Hugo Mills
  2013-03-09 20:31 ` [PATCH 5/5] Add man page description for nCmSpP replication levels Hugo Mills
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 37+ messages in thread
From: Hugo Mills @ 2013-03-09 20:31 UTC (permalink / raw)
  To: linux-btrfs

Signed-off-by: Hugo Mills <hugo@carfax.org.uk>
---
 cmds-filesystem.c |  135 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 114 insertions(+), 21 deletions(-)

diff --git a/cmds-filesystem.c b/cmds-filesystem.c
index 2210020..8ecc21a 100644
--- a/cmds-filesystem.c
+++ b/cmds-filesystem.c
@@ -39,24 +39,126 @@ static const char * const filesystem_cmd_group_usage[] = {
 };
 
 static const char * const cmd_df_usage[] = {
-	"btrfs filesystem df <path>",
+	"btrfs filesystem df [options] <path>",
 	"Show space usage information for a mount point",
+	"",
+	"-r      Use old-style RAID-n terminology",
 	NULL
 };
 
+#define RAID_NAMES_NEW 0
+#define RAID_NAMES_OLD 1
+
+static int write_raid_name(char* buffer, int size, u64 flags, int old_raid)
+{
+	int copies, stripes, parity;
+	int out;
+	int written = 0;
+
+	if (old_raid == RAID_NAMES_OLD) {
+		if (flags & BTRFS_BLOCK_GROUP_RAID0) {
+			return snprintf(buffer, size, "%s", "RAID0");
+		} else if (flags & BTRFS_BLOCK_GROUP_RAID1) {
+			return snprintf(buffer, size, "%s", "RAID1");
+		} else if (flags & BTRFS_BLOCK_GROUP_DUP) {
+			return snprintf(buffer, size, "%s", "DUP");
+		} else if (flags & BTRFS_BLOCK_GROUP_RAID10) {
+			return snprintf(buffer, size, "%s", "RAID10");
+		} else if (flags & BTRFS_BLOCK_GROUP_RAID5) {
+			return snprintf(buffer, size, "%s", "RAID5");
+		} else if (flags & BTRFS_BLOCK_GROUP_RAID6) {
+			return snprintf(buffer, size, "%s", "RAID6");
+		}
+		return 0;
+	}
+
+	if (flags & (BTRFS_BLOCK_GROUP_RAID1
+				 | BTRFS_BLOCK_GROUP_RAID10
+				 | BTRFS_BLOCK_GROUP_DUP)) {
+		copies = 2;
+	} else {
+		copies = 1;
+	}
+
+	out = snprintf(buffer, size, "%dC", copies);
+	if (size < out)
+		return written + size;
+	written += out;
+	size -= out;
+
+	if (flags & BTRFS_BLOCK_GROUP_DUP) {
+		out = snprintf(buffer+written, size, "D");
+		if (size < out)
+			return written + size;
+		written += out;
+		size -= out;
+	}
+
+	if (flags & (BTRFS_BLOCK_GROUP_RAID0
+				 | BTRFS_BLOCK_GROUP_RAID10
+				 | BTRFS_BLOCK_GROUP_RAID5
+				 | BTRFS_BLOCK_GROUP_RAID6)) {
+		stripes = -1;
+	} else {
+		stripes = 0;
+	}
+
+	if (stripes == -1) {
+		out = snprintf(buffer+written, size, "mS");
+	} else if (stripes == 0) {
+		out = 0;
+	} else {
+		out = snprintf(buffer+written, size, "%dS", stripes);
+	}
+
+	if (size < out)
+		return written + size;
+	written += out;
+	size -= out;
+
+	if (flags & BTRFS_BLOCK_GROUP_RAID5) {
+		parity = 1;
+	} else if (flags & BTRFS_BLOCK_GROUP_RAID6) {
+		parity = 2;
+	} else {
+		parity = 0;
+	}
+
+	if (parity == 0) {
+		out = 0;
+	} else {
+		out = snprintf(buffer+written, size, "%dP", parity);
+	}
+
+	if (size < out)
+		return written + size;
+	written += out;
+	size -= out;
+
+	return written;
+}
+
 static int cmd_df(int argc, char **argv)
 {
 	struct btrfs_ioctl_space_args *sargs, *sargs_orig;
+	int path_start = 1;
 	u64 count = 0, i;
 	int ret;
 	int fd;
 	int e;
 	char *path;
+	int old_raid;
 
-	if (check_argc_exact(argc, 2))
+	old_raid = RAID_NAMES_NEW;
+	if (argc > 1 && (!strcmp(argv[1], "--raid") || !strcmp(argv[1], "-r"))) {
+		old_raid = RAID_NAMES_OLD;
+		path_start += 1;
+	}
+
+	if (check_argc_exact(argc, path_start+1))
 		usage(cmd_df_usage);
 
-	path = argv[1];
+	path = argv[path_start];
 
 	fd = open_file_or_dir(path);
 	if (fd < 0) {
@@ -135,24 +237,15 @@ static int cmd_df(int argc, char **argv)
 			written += 8;
 		}
 
-		if (flags & BTRFS_BLOCK_GROUP_RAID0) {
-			snprintf(description+written, 8, "%s", ", RAID0");
-			written += 7;
-		} else if (flags & BTRFS_BLOCK_GROUP_RAID1) {
-			snprintf(description+written, 8, "%s", ", RAID1");
-			written += 7;
-		} else if (flags & BTRFS_BLOCK_GROUP_DUP) {
-			snprintf(description+written, 6, "%s", ", DUP");
-			written += 5;
-		} else if (flags & BTRFS_BLOCK_GROUP_RAID10) {
-			snprintf(description+written, 9, "%s", ", RAID10");
-			written += 8;
-		} else if (flags & BTRFS_BLOCK_GROUP_RAID5) {
-			snprintf(description+written, 9, "%s", ", RAID5");
-			written += 7;
-		} else if (flags & BTRFS_BLOCK_GROUP_RAID6) {
-			snprintf(description+written, 9, "%s", ", RAID6");
-			written += 7;
+		if((flags & ~(BTRFS_BLOCK_GROUP_DATA
+					  | BTRFS_BLOCK_GROUP_SYSTEM
+					  | BTRFS_BLOCK_GROUP_METADATA)) != 0) {
+			snprintf(description+written, 3, ", ");
+			written += 2;
+			written += write_raid_name(description+written,
+									   sizeof(description)-written,
+									   flags,
+									   old_raid);
 		}
 
 		total_bytes = pretty_sizes(sargs->spaces[i].total_bytes);
-- 
1.7.10.4
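
For illustration (the sizes and profiles below are invented), the two output
styles then look something like:

	# btrfs fi df /mnt
	Data, 2CmS: total=2.00GB, used=1.25GB
	System, 2C: total=32.00MB, used=16.00KB
	Metadata, 2C: total=1.00GB, used=350.25MB

	# btrfs fi df -r /mnt
	Data, RAID10: total=2.00GB, used=1.25GB
	System, RAID1: total=32.00MB, used=16.00KB
	Metadata, RAID1: total=1.00GB, used=350.25MB

Chunks with no replication bits set (i.e. single) keep printing with no
suffix at all, which is the "omit the 1C" behaviour mentioned in the cover
letter.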



* [PATCH 5/5] Add man page description for nCmSpP replication levels
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
                   ` (3 preceding siblings ...)
  2013-03-09 20:31 ` [PATCH 4/5] Change output of btrfs fi df to report new (or old) RAID names Hugo Mills
@ 2013-03-09 20:31 ` Hugo Mills
  2013-03-10 14:01   ` Goffredo Baroncelli
  2013-03-09 21:38 ` [PATCH 0/5] [RFC] RAID-level terminology change Harald Glatt
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 37+ messages in thread
From: Hugo Mills @ 2013-03-09 20:31 UTC (permalink / raw)
  To: linux-btrfs

Signed-off-by: Hugo Mills <hugo@carfax.org.uk>
---
 man/btrfs.8.in      |    9 +++++++++
 man/mkfs.btrfs.8.in |   24 +++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/man/btrfs.8.in b/man/btrfs.8.in
index 94f4ffe..2799ec7 100644
--- a/man/btrfs.8.in
+++ b/man/btrfs.8.in
@@ -25,6 +25,8 @@ btrfs \- control a btrfs filesystem
 [-s \fIstart\fR] [-t \fIsize\fR] -[vf] <\fIfile\fR>|<\fIdir\fR> \
 [<\fIfile\fR>|<\fIdir\fR>...]
 .PP
+\fBbtrfs\fP \fBfilesystem df\fP [-r]\fI <path> \fP
+.PP
 \fBbtrfs\fP \fBfilesystem sync\fP\fI <path> \fP
 .PP
 \fBbtrfs\fP \fBfilesystem resize\fP\fI [devid:][+/\-]<size>[gkm]|[devid:]max <filesystem>\fP
@@ -217,6 +219,13 @@ don't use it if you use snapshots, have de-duplicated your data or made
 copies with \fBcp --reflink\fP.
 .TP
 
+\fBfilesystem df\fR [-r] \fI<path>\fR
+Show usage information for the filesystem identified by \fI<path>\fR.
+
+\fB-r, --raid\fP Use old-style "RAID-n" terminology to show replication types
+
+.TP
+
 \fBfilesystem sync\fR\fI <path> \fR
 Force a sync for the filesystem identified by \fI<path>\fR.
 .TP
diff --git a/man/mkfs.btrfs.8.in b/man/mkfs.btrfs.8.in
index 41163e0..2e71e65 100644
--- a/man/mkfs.btrfs.8.in
+++ b/man/mkfs.btrfs.8.in
@@ -37,7 +37,29 @@ mkfs.btrfs uses all the available storage for the filesystem.
 .TP
 \fB\-d\fR, \fB\-\-data \fItype\fR
 Specify how the data must be spanned across the devices specified. Valid
-values are raid0, raid1, raid10 or single.
+values are of the form <n>C[D][<m>S[<p>P]], where <n> is the number of copies
+of data, <m> is the number of stripes per copy, and <p> is the number of parity
+stripes. The <m> parameter must (currently) be a literal "m", indicating that
+as many stripes as possible will be used. The letter D may be added to the
+number of copies, to indicate non-redundant copies (e.g. on the same device).
+
+The following deprecated values may also be used:
+.RS 16
+.P
+single	1C
+.P
+raid0	1CmS
+.P
+raid1	2C
+.P
+dup		2CD
+.P
+raid10	2CmS
+.P
+raid5	1CmS1P
+.P
+raid6	1CmS2P
+.RS -16
 .TP
 \fB\-f\fR
 Force overwrite when an existing filesystem is detected on the device.
-- 
1.7.10.4



* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
                   ` (4 preceding siblings ...)
  2013-03-09 20:31 ` [PATCH 5/5] Add man page description for nCmSpP replication levels Hugo Mills
@ 2013-03-09 21:38 ` Harald Glatt
       [not found] ` <CAFWF=am4ki529Zez4123gYk3BD+Z9RONRpAK7NZe=skHzcdMiw@mail.gmail.com>
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 37+ messages in thread
From: Harald Glatt @ 2013-03-09 21:38 UTC (permalink / raw)
  To: linux-btrfs

On Sat, Mar 9, 2013 at 9:31 PM, Hugo Mills <hugo@carfax.org.uk> wrote:
>    Some time ago, and occasionally since, we've discussed altering the
> "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
> number of copies, m is the number of (data) devices in a stripe per copy,
> and p is the number of parity devices in a stripe.
>
>    The current kernel implementation uses as many devices as it can in the
> striped modes (RAID-0, -10, -5, -6), and in this implementation, that is
> written as "mS" (with a literal "m"). The mS and pP sections are omitted
> if the value is 1S or 0P.
>
>    The magic look-up table for old-style / new-style is:
>
> single           1C (or omitted, in btrfs fi df output)
> RAID-0           1CmS
> RAID-1           2C
> DUP                      2CD
> RAID-10          2CmS
> RAID-5           1CmS1P
> RAID-6           1CmS2P
>
>    The following patch set modifies userspace tools to accept C/S/P formats
> in input (mkfs and the restriper). The older formats are also accepted. It
> also prints the newer formats by default in btrfs fi df, with an option to
> show the older format for the traditionalists.
>
>    I'm not sure whether we should omit the 1C in btrfs fi df output, or
> make it explicit even in the "single" case.
>
>    Hugo.
>
> Hugo Mills (5):
>   Use nCmSpP format for mkfs
>   Move parse_profile to utils.c
>   Convert balance filter parser to use common nCmSpP replication-level
>     parser
>   Change output of btrfs fi df to report new (or old) RAID names
>   Add man page description for nCmSpP replication levels
>
>  cmds-balance.c      |   23 +++------
>  cmds-filesystem.c   |  135 +++++++++++++++++++++++++++++++++++++++++++--------
>  man/btrfs.8.in      |    9 ++++
>  man/mkfs.btrfs.8.in |   24 ++++++++-
>  mkfs.c              |   35 ++++---------
>  utils.c             |   94 +++++++++++++++++++++++++++++++++++
>  utils.h             |    1 +
>  7 files changed, 258 insertions(+), 63 deletions(-)
>
> --
> 1.7.10.4
>

I vote to make it explicit, to give people a chance to understand the new
way it works even with just one device. The idea for this new naming
scheme is great, I like it!! Will btrfs allow for things like 2C10S2P
too? Basically two RAID6 arrays with 12 drives each (10 for data, 2 for
parity), if I understand it right, with the entire array being mirrored
to another one.


* Re: [PATCH 0/5] [RFC] RAID-level terminology change
       [not found] ` <CAFWF=am4ki529Zez4123gYk3BD+Z9RONRpAK7NZe=skHzcdMiw@mail.gmail.com>
@ 2013-03-09 21:46   ` Hugo Mills
  0 siblings, 0 replies; 37+ messages in thread
From: Hugo Mills @ 2013-03-09 21:46 UTC (permalink / raw)
  To: Harald Glatt; +Cc: linux-btrfs

On Sat, Mar 09, 2013 at 10:31:25PM +0100, Harald Glatt wrote:
> On Sat, Mar 9, 2013 at 9:31 PM, Hugo Mills <hugo@carfax.org.uk> wrote:
> 
> >    Some time ago, and occasionally since, we've discussed altering the
> > "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
> > number of copies, m is the number of (data) devices in a stripe per copy,
> > and p is the number of parity devices in a stripe.
> >
> >    The current kernel implementation uses as many devices as it can in the
> > striped modes (RAID-0, -10, -5, -6), and in this implementation, that is
> > written as "mS" (with a literal "m"). The mS and pP sections are omitted
> > if the value is 1S or 0P.
> >
> >    The magic look-up table for old-style / new-style is:
> >
> > single           1C (or omitted, in btrfs fi df output)
> > RAID-0           1CmS
> > RAID-1           2C
> > DUP                      2CD
> > RAID-10          2CmS
> > RAID-5           1CmS1P
> > RAID-6           1CmS2P
> >
> >    The following patch set modifies userspace tools to accept C/S/P formats
> > in input (mkfs and the restriper). The older formats are also accepted. It
> > also prints the newer formats by default in btrfs fi df, with an option to
> > show the older format for the traditionalists.
> >
> >    I'm not sure whether we should omit the 1C in btrfs fi df output, or
> > make it explicit even in the "single" case.
> >
> >    Hugo.
> >
> > Hugo Mills (5):
> >   Use nCmSpP format for mkfs
> >   Move parse_profile to utils.c
> >   Convert balance filter parser to use common nCmSpP replication-level
> >     parser
> >   Change output of btrfs fi df to report new (or old) RAID names
> >   Add man page description for nCmSpP replication levels
> >
> >  cmds-balance.c      |   23 +++------
> >  cmds-filesystem.c   |  135
> > +++++++++++++++++++++++++++++++++++++++++++--------
> >  man/btrfs.8.in      |    9 ++++
> >  man/mkfs.btrfs.8.in |   24 ++++++++-
> >  mkfs.c              |   35 ++++---------
> >  utils.c             |   94 +++++++++++++++++++++++++++++++++++
> >  utils.h             |    1 +
> >  7 files changed, 258 insertions(+), 63 deletions(-)
> >
> > --
> > 1.7.10.4
> >
> >
> 
> I vote make it explicit to give people a chance to understand the new way
> it works even with just one device. The idea for this new naming scheme is
> great, I like it!! Will btrfs allow for things like 2C10S2P too? Basically
> two RAID6 arrays with 12 drives each (10 for data, 2 for parity), if I
> understand it right with the entire array being mirrored to another one.

   You wouldn't necessarily get exactly the same semantics. 2C10S2P
would require at least 24 drives, and in the traditional "mirrored
pair of RAID-6" configuration, you could lose drives 1-14, and still
rebuild successfully from the remaining degraded RAID-6 array. The
btrfs implementation probably wouldn't let you go so far -- you'd only
get a guarantee of being able to lose any four devices; anything after
that would be quite likely to destroy the array, because the
allocation of "position" to device in any given stripe isn't as
regimented as for the traditional RAID configurations.

   Basically, both would provide the same minimum safety guarantees,
but the odds of the btrfs array losing (some) data after that point
are considerably higher. This is all moot for now, anyway, because as
far as I know there aren't any plans for generalising this completely.

   Chris is definitely planning fixed values for mS (so you can ask
for, say, exactly 4 stripes and one parity), and values for nC greater
than 2. As far as I know, there aren't any plans for nC > 1 and pP > 0
together. I haven't got far enough into the kernel code to work out
whether that's simple or not to implement.

   Hugo.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
      --- What's a Nazgûl like you doing in a place like this? ---      


* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
                   ` (6 preceding siblings ...)
       [not found] ` <CAFWF=am4ki529Zez4123gYk3BD+Z9RONRpAK7NZe=skHzcdMiw@mail.gmail.com>
@ 2013-03-09 22:25 ` Roger Binns
  2013-03-10  1:44   ` Hugo Mills
  2013-03-10 11:23 ` Martin Steigerwald
                   ` (2 subsequent siblings)
  10 siblings, 1 reply; 37+ messages in thread
From: Roger Binns @ 2013-03-09 22:25 UTC (permalink / raw)
  To: linux-btrfs

On 09/03/13 12:31, Hugo Mills wrote:
> Some time ago, and occasionally since, we've discussed altering the 
> "RAID-n" terminology to change it to an "nCmSpP" format, where n is
> the number of copies, m is the number of (data) devices in a stripe per
> copy, and p is the number of parity devices in a stripe.

I despise both terminologies because they mix up administrator goals with
how those goals are provided by the filesystem.

Using RAID0 as an example, what is actually desired is maximum performance
and there is no need to survive the failure of even a single disk. I don't
actually care if it uses striping, parity, hot data tracking, moving
things to faster outside edges of spinning disks, hieroglyphics, rot13
encoding, all of the above or anything else.

Maximum performance is always desired and "RAID" settings really track to
"data must survive the failure of N disks" and/or "data must be accessible
if at least N disks are present".  As an administrator that is what I
would like to set and let the filesystem do whatever is necessary to meet
those goals  (I'd love to be able to set this on a per directory/file
basis too.)

Roger



* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-09 22:25 ` Roger Binns
@ 2013-03-10  1:44   ` Hugo Mills
  2013-03-10  5:41     ` Roger Binns
  0 siblings, 1 reply; 37+ messages in thread
From: Hugo Mills @ 2013-03-10  1:44 UTC (permalink / raw)
  To: Roger Binns; +Cc: linux-btrfs

On Sat, Mar 09, 2013 at 02:25:25PM -0800, Roger Binns wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> On 09/03/13 12:31, Hugo Mills wrote:
> > Some time ago, and occasionally since, we've discussed altering the 
> > "RAID-n" terminology to change it to an "nCmSpP" format, where n is
> > the number of copies, m is the number of (data) devices in a stripe per
> > copy, and p is the number of parity devices in a stripe.
> 
> I despise both terminologies because they mix up administrator goals with
> how those goals are provided by the filesystem.
> 
> Using RAID0 as an example, what is actually desired is maximum performance
> and there is no need to survive the failure of even a single disk. I don't
> actually care if it uses striping, parity, hot data tracking, moving
> things to faster outside edges of spinning disks, hieroglyphics, rot13
> encoding, all of the above or anything else.
> 
> Maximum performance is always desired and "RAID" settings really track to
> "data must survive the failure of N disks" and/or "data must be accessible
> if at least N disks are present".  As an administrator that is what I
> would like to set and let the filesystem do whatever is necessary to meet
> those goals  (I'd love to be able to set this on a per directory/file
> basis too.)

   You've got at least three independent parameters to the system in
order to make that choice, though, and it's a fairly fuzzy decision
problem. You've got:

 - Device redundancy
 - Storage overhead
 - Performance

   So, for example, if you have 6 devices, and specify device
redundancy of 1 (i.e. you can lose at least one device without losing
any data), you have many options:

 - 2CmS    (50%)
 - 2C3S    (50%)
 - 2C2S    (50%)
 - 1C2S1P  (66%)
 - 1C3S1P  (75%)
 - 1C4S1P  (80%)
 - 1C5S1P  (83%)
 - 1CmS1P  (50%-83%)

where the figure in brackets is the storage-to-disk ratio, and the
list is in approximate decreasing order of mean performance(*). Using
these criteria for selecting a suitable operating mode isn't
impossible, but it's not a cut-and-dried thing, what with the
trade-off between storage ratio (which is easily-computed) and
performance (which isn't).
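
   The bracketed ratios are just the data stripes divided by the total
stripes written across all copies, i.e. m / (n * (m + p)). A throwaway
snippet (not part of this patch set) to reproduce the figures:

	#include <stdio.h>

	/* Usable percentage of raw space for an nC mS pP profile, assuming
	 * every chunk actually reaches the stripe width m. */
	static int storage_pct(int copies, int stripes, int parity)
	{
		return 100 * stripes / (copies * (stripes + parity));
	}

	int main(void)
	{
		printf("2C2S:   %d%%\n", storage_pct(2, 2, 0));	/* 50 */
		printf("1C2S1P: %d%%\n", storage_pct(1, 2, 1));	/* 66 */
		printf("1C4S1P: %d%%\n", storage_pct(1, 4, 1));	/* 80 */
		printf("1C5S1P: %d%%\n", storage_pct(1, 5, 1));	/* 83 */
		return 0;
	}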

   I definitely want to report the results in nCmSpP form, which tells
you what it's actually done. The internal implementation, while not
expressing the full gamut of possibilities, maps directly from the
internal configuration to that form, and so it should at least be an
allowable input for configuration (e.g. mkfs.btrfs and the restriper).

   If you'd like to suggest a usable set of configuration axes [say,
(redundancy, overhead) ], and a set of rules for converting
those requirements to the internal representation, then there's no
reason we can't add them as well in a later set of patches.

   Note that for a given set of (redundancy, overhead) requirements,
the optimal operating point can change as the number of devices
changes. However, since the management of the number of devices is
pretty much entirely driven through userspace, we can handle all of
this in userspace. So if you want (1, 75%), and add a new disk, you'd
rebalance with (1, 75%) as the replication config parameter, and we
can convert that in userspace into the appropriate internal
representation to feed to the kernel.

   Hugo.

(*) Source: the inside of my head. Your mileage may vary. The value of
this prediction may go down as well as up. Contents may have settled
in transit. etc...

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
                 --- emacs: Eats Memory and Crashes. ---                 


* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10  1:44   ` Hugo Mills
@ 2013-03-10  5:41     ` Roger Binns
  2013-03-10  6:29       ` Harald Glatt
  2013-03-10 22:04       ` Hugo Mills
  0 siblings, 2 replies; 37+ messages in thread
From: Roger Binns @ 2013-03-10  5:41 UTC (permalink / raw)
  To: linux-btrfs

On 09/03/13 17:44, Hugo Mills wrote:
> You've got at least three independent parameters to the system in order
> to make that choice, though, and it's a fairly fuzzy decision problem.
> You've got:
> 
> - Device redundancy
> - Storage overhead
> - Performance

Overhead and performance aren't separate goals.  More accurately the goal
is best performance given the devices available and constrained by redundancy.

If I have 1GB of unique data and 10GB of underlying space available then
feel free to make 9 additional copies of each piece of data if that helps
performance.  As I increase the unique data the overhead available will
decrease, but I doubt anyone has a goal of micromanaging overhead usage.
Why can't the filesystem just figure it out and do the best job available
given minimal constraints?

> I definitely want to report the results in nCmSpP form, which tells you
> what it's actually done. The internal implementation, while not 
> expressing the full gamut of possibilities, maps directly from the 
> internal configuration to that form, and so it should at least be an 
> allowable input for configuration (e.g. mkfs.btrfs and the restriper).

Agreed on that for the micromanagers :-)

> If you'd like to suggest a usable set of configuration axes [say, 
> (redundancy, overhead) ], and a set of rules for converting those
> requirements to the internal representation, then there's no reason we
> can't add them as well in a later set of patches.

The only constraints that matter are surviving N device failures, and data
not lost if at least N devices are still present.  Under the hood the best
way of meeting those can be heuristically determined, and I'd expect
things like overhead to dynamically adjust as storage fills up or empties.

Roger



* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10  5:41     ` Roger Binns
@ 2013-03-10  6:29       ` Harald Glatt
  2013-03-10  6:37         ` Harald Glatt
  2013-03-10 22:04       ` Hugo Mills
  1 sibling, 1 reply; 37+ messages in thread
From: Harald Glatt @ 2013-03-10  6:29 UTC (permalink / raw)
  To: Roger Binns; +Cc: linux-btrfs

On Sun, Mar 10, 2013 at 6:41 AM, Roger Binns <rogerb@rogerbinns.com> wrote:
> On 09/03/13 17:44, Hugo Mills wrote:
>> You've got at least three independent parameters to the system in order
>> to make that choice, though, and it's a fairly fuzzy decision problem.
>> You've got:
>>
>> - Device redundancy
>> - Storage overhead
>> - Performance
>
> Overhead and performance aren't separate goals.  More accurately the goal
> is best performance given the devices available and constrained by redundancy.
>
> If I have 1GB of unique data and 10GB of underlying space available then
> feel free to make 9 additional copies of each piece of data if that helps
> performance.  As I increase the unique data the overhead available will
> decrease, but I doubt anyone has a goal of micromanaging overhead usage.
> Why can't the filesystem just figure it out and do the best job available
> given minimal constraints?
>
>> I definitely want to report the results in nCmSpP form, which tells you
>> what it's actually done. The internal implementation, while not
>> expressing the full gamut of possibilities, maps directly from the
>> internal configuration to that form, and so it should at least be an
>> allowable input for configuration (e.g. mkfs.btrfs and the restriper).
>
> Agreed on that for the micromanagers :-)
>
>> If you'd like to suggest a usable set of configuration axes [say,
>> (redundancy, overhead) ], and a set of rules for converting those
>> requirements to the internal representation, then there's no reason we
>> can't add them as well in a later set of patches.
>
> The only constraints that matter are surviving N device failures, and data
> not lost if at least N devices are still present.  Under the hood the best
> way of meeting those can be heuristically determined, and I'd expect
> things like overhead to dynamically adjust as storage fills up or empties.
>
> Roger
>

Very good points,

I was also going to write something along the lines of 'all that matters is
achieving the minimum amount of redundancy, as requested by the user,
at the maximum possible performance'.

After reading your post now, Roger, I'm much more clear on what I
actually wanted to say, which is pretty much the same thing:

In paradise really all I would have to tell btrfs is how many drives
I'm willing to give away so that they will be used exclusively for
redundancy. Everything else btrfs should figure out by itself. Not
just because it's simpler for the user, but also because btrfs
actually is in a position to KNOW better.

As Roger said, as long as the given minimum redundancy quota is
met, btrfs could make choices that favor either even more
redundancy or more performance, based on my usage of the filesystem,
by measuring things like throughput, IOPS or space used. I could
imagine that a snail filesystem that barely fills and doesn't do a
whole lot could easily work its way up to building huge redundancy,
while a filesystem that requires high performance would do aggressive
striping to achieve maximum performance, while only keeping the
minimum requested redundancy intact.

It sounds quite futuristic to me, but it is definitely something that
we have to achieve hopefully rather sooner than later :)

I'm looking forward to it!


* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10  6:29       ` Harald Glatt
@ 2013-03-10  6:37         ` Harald Glatt
  2013-03-10 11:31           ` Martin Steigerwald
  2013-03-10 11:48           ` Roger Binns
  0 siblings, 2 replies; 37+ messages in thread
From: Harald Glatt @ 2013-03-10  6:37 UTC (permalink / raw)
  To: Roger Binns; +Cc: linux-btrfs

> Very good points,
>
> I was also gonna write something by the lines of 'all that matters is
> achieving the minimum amount of redundancy, as requested by the user,
> at the maximum possible performance'.
>
> After reading your post now, Roger, I'm much more clear on what I
> actually wanted to say, which is pretty much the same thing:
>
> In paradise really all I would have to tell btrfs is how many drives
> I'm willing to give away so that they will be used exclusively for
> redundancy. Everything else btrfs should figure out by itself. Not
> just because it's simpler for the user, but also because btrfs
> actually is in a position to KNOW better.
>
> As Roger said, as long as the given minimum redundancy quota is
> filled, btrfs could make choices that favor either even more
> redundancy or more performance based on my usage of the filesystem by
> meassuring things like throughput, ioops or space used. I could
> imagine that a snail filesystem that barely fills and doesn't do whole
> lot could easily work itself up on building huge redunancy, while a
> filesystem that requires high performance would do excessive striping
> to achieve maximum performance, while only keeping the minimum
> requested redundancy intact.
>
> It sounds quite futuristic to me, but it is definitely something that
> we have to achieve hopefully rather sooner than later :)
>
> I'm looking forward to it!

I have to add something to my own message: Even the notion of thinking
in 'how many devices do I want to give away for redundancy' is
outdated... What it really comes down to is how much space I am
willing to sacrifice so that my reliability increases. Rather than
addressing that at a per-drive level, with a futuristic fs like btrfs I
think setting a percentage of the total space would be best. Here are
my n hard drives; I want to give up thirty percent of the maximum space
so that, basically, I can lose that amount of space across any device
combination and be safe. Do this for me, and do it at the maximum
possible performance with the device count, types, and sizes that I've
given you. And if I'm not using the filesystem much, if I have tons of
space free, feel free to build even more redundancy while idle.

This is pretty much what it would be like in a perfect world, in my opinion :)


* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
                   ` (7 preceding siblings ...)
  2013-03-09 22:25 ` Roger Binns
@ 2013-03-10 11:23 ` Martin Steigerwald
  2013-03-10 14:11   ` Goffredo Baroncelli
                     ` (2 more replies)
  2013-03-10 15:43 ` Goffredo Baroncelli
  2013-03-10 23:55 ` sam tygier
  10 siblings, 3 replies; 37+ messages in thread
From: Martin Steigerwald @ 2013-03-10 11:23 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Hugo Mills

Hi Hugo,

Am Samstag, 9. März 2013 schrieb Hugo Mills:
>    Some time ago, and occasionally since, we've discussed altering the
> "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
> number of copies, m is the number of (data) devices in a stripe per copy,
> and p is the number of parity devices in a stripe.
> 
>    The current kernel implementation uses as many devices as it can in
> the striped modes (RAID-0, -10, -5, -6), and in this implementation,
> that is written as "mS" (with a literal "m"). The mS and pP sections are
> omitted if the value is 1S or 0P.
> 
>    The magic look-up table for old-style / new-style is:
> 
> single           1C (or omitted, in btrfs fi df output)
> RAID-0           1CmS
> RAID-1           2C
> DUP                      2CD

What does the "D" in "2CD" mean? Its not explained above, unless I miss 
something.

> RAID-10          2CmS
> RAID-5           1CmS1P
> RAID-6           1CmS2P

I think it's great to clarify the RAID-level terminology, but I find the new
notation a bit, hmmm, cryptic.

Maybe for display it would be nice to show a more verbose format like

1 copy, many stripes, 1 parity (1CmS1P)

by default, and the abbreviated one in parentheses?

Any other idea to make it less cryptic?

Thanks,
-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7


* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10  6:37         ` Harald Glatt
@ 2013-03-10 11:31           ` Martin Steigerwald
  2013-03-10 11:48           ` Roger Binns
  1 sibling, 0 replies; 37+ messages in thread
From: Martin Steigerwald @ 2013-03-10 11:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Harald Glatt, Roger Binns

Am Sonntag, 10. März 2013 schrieb Harald Glatt:
> > Very good points,
> > 
> > I was also gonna write something by the lines of 'all that matters is
> > achieving the minimum amount of redundancy, as requested by the user,
> > at the maximum possible performance'.
> > 
> > After reading your post now, Roger, I'm much more clear on what I
> > actually wanted to say, which is pretty much the same thing:
> > 
> > In paradise really all I would have to tell btrfs is how many drives
> > I'm willing to give away so that they will be used exclusively for
> > redundancy. Everything else btrfs should figure out by itself. Not
> > just because it's simpler for the user, but also because btrfs
> > actually is in a position to KNOW better.
[…]
> > It sounds quite futuristic to me, but it is definitely something that
> > we have to achieve hopefully rather sooner than later :)
> > 
> > I'm looking forward to it!
> 
> I have to add something to my own message: Even the notion of thinking
> in 'how many devices do I want to give away for redundancy' is
> outdated... What it really comes down to is how much space am I
> willing to sacrifice so that my reliablilty is increasing. Rather than
> addressing that at a per-drive level with a futuristic fs like btrfs I
> think setting a percent value of total space would be best. Here are
> my n hard drives, I want to give up thirty percent of maximum space so
> that basically I can lose that amount of space across any device
> combination and be safe. Do this for me and do it at maximum possible
> performance with the device count, and type, and sizes that I've given
> you. And if I'm not using the filesystem much, if I have tons of space
> free, feel free to build even more redundancy while idle.
> 
> This is pretty much what it would be like in a perfect world, in my
> opinion :) --

Still, the way to a perfect world is taken in steps. Thus I agree with Hugo
to start somewhere, and I think that for an admin who wants to know and
specify exactly what is going on, the patches Hugo offered for discussion
are a huge step forward.

I think they are a good base to build upon.

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7


* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10  6:37         ` Harald Glatt
  2013-03-10 11:31           ` Martin Steigerwald
@ 2013-03-10 11:48           ` Roger Binns
  1 sibling, 0 replies; 37+ messages in thread
From: Roger Binns @ 2013-03-10 11:48 UTC (permalink / raw)
  To: linux-btrfs

On 09/03/13 22:37, Harald Glatt wrote:
> I have to add something to my own message: Even the notion of thinking 
> in 'how many devices do I want to give away for redundancy' is 
> outdated...

Devices are the only real tangible thing that you can actually point at.
They are the final physical unit and what gets added or replaced, and can
be put in another machine for recovery.

> What it really comes down to is how much space I am willing to
> sacrifice so that my reliability increases.

The answer as a whole is simple - you want all unused space for
reliability and performance.  If you put 5GB of data on 10GB of space then
50% is the value.

> Rather than addressing that at a per-drive level with a futuristic fs
> like btrfs I think setting a percent value of total space would be
> best.

The problem is that drives are what fail, and what get added or removed.  A
percentage value makes that considerably harder to deal with, especially for
changes after the initial setup.

It would be nice to be able to indicate that some data/files/directories
are more important than others (e.g. cache/spool/trash directories are
unimportant, my documents and photos are very important).

> This is pretty much what it would be like in a perfect world, in my
> opinion :)

Conceptually I want to point at the drives and say "do the right thing"
without any further configuration, monitoring or micro-managing.  In the
short/medium term I'd even be happy to run a btrfs cron job that digs
around, rearranges things and gives me a final report with colour coding,
homeland-security style.  When it hits yellow, it is time to delete files
or add more storage.

Roger




* Re: [PATCH 5/5] Add man page description for nCmSpP replication levels
  2013-03-09 20:31 ` [PATCH 5/5] Add man page description for nCmSpP replication levels Hugo Mills
@ 2013-03-10 14:01   ` Goffredo Baroncelli
  2013-03-10 17:20     ` Hugo Mills
  0 siblings, 1 reply; 37+ messages in thread
From: Goffredo Baroncelli @ 2013-03-10 14:01 UTC (permalink / raw)
  To: Hugo Mills; +Cc: linux-btrfs

Hi Hugo,

could you please also add to the btrfs man page a section describing
the nCmSpP levels?

Thanks.
GB



On 03/09/2013 09:31 PM, Hugo Mills wrote:
> Signed-off-by: Hugo Mills <hugo@carfax.org.uk>
> ---
>  man/btrfs.8.in      |    9 +++++++++
>  man/mkfs.btrfs.8.in |   24 +++++++++++++++++++++++-
>  2 files changed, 32 insertions(+), 1 deletion(-)
> 
> diff --git a/man/btrfs.8.in b/man/btrfs.8.in
> index 94f4ffe..2799ec7 100644
> --- a/man/btrfs.8.in
> +++ b/man/btrfs.8.in
> @@ -25,6 +25,8 @@ btrfs \- control a btrfs filesystem
>  [-s \fIstart\fR] [-t \fIsize\fR] -[vf] <\fIfile\fR>|<\fIdir\fR> \
>  [<\fIfile\fR>|<\fIdir\fR>...]
>  .PP
> +\fBbtrfs\fP \fBfilesystem df\fP [-r]\fI <path> \fP
> +.PP
>  \fBbtrfs\fP \fBfilesystem sync\fP\fI <path> \fP
>  .PP
>  \fBbtrfs\fP \fBfilesystem resize\fP\fI [devid:][+/\-]<size>[gkm]|[devid:]max <filesystem>\fP
> @@ -217,6 +219,13 @@ don't use it if you use snapshots, have de-duplicated your data or made
>  copies with \fBcp --reflink\fP.
>  .TP
>  
> +\fBfilesystem df\fR [-r] \fI<path>\fR
> +Show usage information for the filesystem identified by \fI<path>\fR.
> +
> +\fB-r, --raid\fP Use old-style "RAID-n" terminology to show replication types
> +
> +.TP
> +
>  \fBfilesystem sync\fR\fI <path> \fR
>  Force a sync for the filesystem identified by \fI<path>\fR.
>  .TP
> diff --git a/man/mkfs.btrfs.8.in b/man/mkfs.btrfs.8.in
> index 41163e0..2e71e65 100644
> --- a/man/mkfs.btrfs.8.in
> +++ b/man/mkfs.btrfs.8.in
> @@ -37,7 +37,29 @@ mkfs.btrfs uses all the available storage for the filesystem.
>  .TP
>  \fB\-d\fR, \fB\-\-data \fItype\fR
>  Specify how the data must be spanned across the devices specified. Valid
> -values are raid0, raid1, raid10 or single.
> +values are of the form <n>C[D][<m>S[<p>P]], where <n> is the number of copies
> +of data, <m> is the number of stripes per copy, and <p> is the number of parity
> +stripes. The <m> parameter must (currently) be a literal "m", indicating that
> +as many stripes as possible will be used. The letter D may be added to the
> +number of copies, to indicate non-redundant copies (e.g. on the same device).
> +
> +The following deprecated values may also be used:
> +.RS 16
> +.P
> +single	1C
> +.P
> +raid0	1CmS
> +.P
> +raid1	2C
> +.P
> +dup		2CD
> +.P
> +raid10	2CmS
> +.P
> +raid5	1CmS1P
> +.P
> +raid6	1CmS2P
> +.RS -16
>  .TP
>  \fB\-f\fR
>  Force overwrite when an existing filesystem is detected on the device.


-- 
gpg @keyserver.linux.it: Goffredo Baroncelli (kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 11:23 ` Martin Steigerwald
@ 2013-03-10 14:11   ` Goffredo Baroncelli
  2013-03-10 21:36   ` Hugo Mills
  2013-03-10 23:06   ` Diego Calleja
  2 siblings, 0 replies; 37+ messages in thread
From: Goffredo Baroncelli @ 2013-03-10 14:11 UTC (permalink / raw)
  To: Martin Steigerwald; +Cc: linux-btrfs

Hi Martin,

On 03/10/2013 12:23 PM, Martin Steigerwald wrote:
> Hi Hugo,
> 
> Am Samstag, 9. März 2013 schrieb Hugo Mills:
>>    Some time ago, and occasionally since, we've discussed altering the
>> "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
>> number of copies, m is the number of (data) devices in a stripe per copy,
>> and p is the number of parity devices in a stripe.
>>
>>    The current kernel implementation uses as many devices as it can in
>> the striped modes (RAID-0, -10, -5, -6), and in this implementation,
>> that is written as "mS" (with a literal "m"). The mS and pP sections are
>> omitted if the value is 1S or 0P.
>>
>>    The magic look-up table for old-style / new-style is:
>>
>> single           1C (or omitted, in btrfs fi df output)
>> RAID-0           1CmS
>> RAID-1           2C
>> DUP                      2CD
> 
> What does the "D" in "2CD" mean? Its not explained above, unless I miss 
> something.

This means DUP (two copies on the same disk); I understood that only
by reading the code.

> 
>
-- 
gpg @keyserver.linux.it: Goffredo Baroncelli (kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
                   ` (8 preceding siblings ...)
  2013-03-10 11:23 ` Martin Steigerwald
@ 2013-03-10 15:43 ` Goffredo Baroncelli
  2013-03-10 22:24   ` Hugo Mills
  2013-03-10 23:40   ` sam tygier
  2013-03-10 23:55 ` sam tygier
  10 siblings, 2 replies; 37+ messages in thread
From: Goffredo Baroncelli @ 2013-03-10 15:43 UTC (permalink / raw)
  To: Hugo Mills; +Cc: linux-btrfs

Hi Hugo,

On 03/09/2013 09:31 PM, Hugo Mills wrote:
>    Some time ago, and occasionally since, we've discussed altering the
> "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
> number of copies, m is the number of (data) devices in a stripe per copy,
> and p is the number of parity devices in a stripe.
> 
>    The current kernel implementation uses as many devices as it can in the
> striped modes (RAID-0, -10, -5, -6), and in this implementation, that is
> written as "mS" (with a literal "m"). The mS and pP sections are omitted
> if the value is 1S or 0P.
> 
>    The magic look-up table for old-style / new-style is:
> 
> single 		 1C (or omitted, in btrfs fi df output)
> RAID-0		 1CmS
> RAID-1		 2C
> DUP			 2CD
> RAID-10		 2CmS
> RAID-5		 1CmS1P
> RAID-6		 1CmS2P


Even though I find the "nCmSpP" format a bit more rational, I think that
it is a bit too complex.

As you said:
>    Chris is definitely planning fixed values for mS (so you can ask
> for, say, exactly 4 stripes and one parity), and values for nC greater
> than 2. As far as I know, there aren't any plans for nC > 1 and pP > 0
> together. I haven't got far enough into the kernel code to work out
> whether that's simple or not to implement.

On that basis we need to handle fewer cases than the full "nCDmSpP"
format allows. So I suggest allowing the following "short forms":

- DUP			-> dD		(to allow more than 2 copies
					 per disk)

- RAID1			-> nC or *C	

- RAID0 		-> mS or *S

- RAID10		-> nCmS or *CmS or nC*s

- RAID with parity	-> mSpP or *SpP or mS*p (is it possible?)

- single		-> 1C or 1D or 1S or "single"


where d,n,m,p are integers; '*' is the literal '*' and means "as many
as possible".

For example if I have 6 disks:
*C2S	-> means: 3 copies, 2 stripes ( m*n = 3*2 == num disk == 6)
2C*S	-> means: 2 copies, 3 stripes ( m*n = 3*2 == num disk == 6)
*S2P	-> means: 2 parity, 4 stripes ( p+m = 4+2 == num disk == 6)

We could also have some defaults, like: d=2, n=2, m=*, p=1, so some
common forms become:
D	-> means DUP (d==2)
S	-> means RAID0 (s=*)
C	-> means RAID1 (n=2)
CS	-> means RAID10 (n=2, m=*)
SP	-> means RAID5 (m=*, p=1)
SP2	-> means RAID6 (m=*, p=2)
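
(Just to illustrate the idea, a trivial sketch of how such defaults
could be filled in; this table is hypothetical and not part of any
patch:)

    /* Hypothetical sketch: expand the proposed short forms using the
     * defaults d=2, n=2, m=*, p=1.  Nothing like this exists in the
     * btrfs tools; it only illustrates the proposal above. */
    #include <stdio.h>
    #include <string.h>

    struct shortform { const char *in; const char *expanded; };

    static const struct shortform table[] = {
        { "D",   "2D"   },  /* DUP    */
        { "S",   "*S"   },  /* RAID0  */
        { "C",   "2C"   },  /* RAID1  */
        { "CS",  "2C*S" },  /* RAID10 */
        { "SP",  "*S1P" },  /* RAID5  */
        { "SP2", "*S2P" },  /* RAID6  */
    };

    static const char *expand(const char *in)
    {
        size_t i;
        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
            if (strcmp(table[i].in, in) == 0)
                return table[i].expanded;
        return in;  /* already fully specified */
    }

    int main(void)
    {
        printf("SP -> %s\n", expand("SP"));  /* prints "SP -> *S1P" */
        return 0;
    }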

More complex forms like
	5C3D4S2P
would also be allowable, but I don't think they will ever be supported.

I have a request: could we consider swapping the letters and the numbers?
For example 3S2P could become S3P2. From my *subjective* point of view,
this seems clearer. What do you think?

However, I am also open to the following forms:

dD -> DUP(d); for backward compatibility, DUP means d==2
nC -> RAID1(n); for backward compatibility, RAID1 means n==2
mS -> RAID0(m); for backward compatibility, RAID0 means m==*
nCmS -> RAID10(n,m); for backward compatibility, RAID10 means n=2, m=*
mSpP -> RAIDP(m,p); for backward compatibility, RAID6 means p=2, m=*
			and RAID5 means p=1, m=*

These are more verbose but also more familiar to the administrator.

What do you think ?

-- 
gpg @keyserver.linux.it: Goffredo Baroncelli (kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 5/5] Add man page description for nCmSpP replication levels
  2013-03-10 14:01   ` Goffredo Baroncelli
@ 2013-03-10 17:20     ` Hugo Mills
  2013-03-10 17:52       ` Goffredo Baroncelli
  0 siblings, 1 reply; 37+ messages in thread
From: Hugo Mills @ 2013-03-10 17:20 UTC (permalink / raw)
  To: kreijack; +Cc: linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 1743 bytes --]

On Sun, Mar 10, 2013 at 03:01:12PM +0100, Goffredo Baroncelli wrote:
> Hi Hugo,
> 
> could you please add also to the btrfs man page a section where are
> described the nCmSpP levels ?
> 
> Thanks.
> GB
> 
> 
> 
> On 03/09/2013 09:31 PM, Hugo Mills wrote:
> > Signed-off-by: Hugo Mills <hugo@carfax.org.uk>
> > ---
[snip]
> > diff --git a/man/mkfs.btrfs.8.in b/man/mkfs.btrfs.8.in
> > index 41163e0..2e71e65 100644
> > --- a/man/mkfs.btrfs.8.in
> > +++ b/man/mkfs.btrfs.8.in
> > @@ -37,7 +37,29 @@ mkfs.btrfs uses all the available storage for the filesystem.
> >  .TP
> >  \fB\-d\fR, \fB\-\-data \fItype\fR
> >  Specify how the data must be spanned across the devices specified. Valid
> > -values are raid0, raid1, raid10 or single.

   Like here?

> > +values are of the form <n>C[D][<m>S[<p>P]], where <n> is the number of copies
> > +of data, <m> is the number of stripes per copy, and <p> is the number of parity
> > +stripes. The <m> parameter must (currently) be a literal "m", indicating that
> > +as many stripes as possible will be used. The letter D may be added to the
> > +number of copies, to indicate non-redundant copies (e.g. on the same device).
> > +
> > +The following deprecated values may also be used:
> > +.RS 16
> > +.P
> > +single	1C
> > +.P
> > +raid0	1CmS
> > +.P
> > +raid1	2C
> > +.P
> > +dup		2CD
> > +.P
> > +raid10	2CmS
> > +.P
> > +raid5	1CmS1P
> > +.P
> > +raid6	1CmS2P
> > +.RS -16
> >  .TP
> >  \fB\-f\fR
> >  Force overwrite when an existing filesystem is detected on the device.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
            --- I can resist everything except temptation ---            

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 5/5] Add man page description for nCmSpP replication levels
  2013-03-10 17:20     ` Hugo Mills
@ 2013-03-10 17:52       ` Goffredo Baroncelli
  0 siblings, 0 replies; 37+ messages in thread
From: Goffredo Baroncelli @ 2013-03-10 17:52 UTC (permalink / raw)
  To: Hugo Mills, kreijack, linux-btrfs

On 03/10/2013 06:20 PM, Hugo Mills wrote:
> On Sun, Mar 10, 2013 at 03:01:12PM +0100, Goffredo Baroncelli wrote:
>> Hi Hugo,
>>
>> could you please add also to the btrfs man page a section where are
>> described the nCmSpP levels ?
>>
>> Thanks.
>> GB
>>
>>
>>
>> On 03/09/2013 09:31 PM, Hugo Mills wrote:
>>> Signed-off-by: Hugo Mills <hugo@carfax.org.uk>
>>> ---
> [snip]
>>> diff --git a/man/mkfs.btrfs.8.in b/man/mkfs.btrfs.8.in
>>> index 41163e0..2e71e65 100644
>>> --- a/man/mkfs.btrfs.8.in
>>> +++ b/man/mkfs.btrfs.8.in
>>> @@ -37,7 +37,29 @@ mkfs.btrfs uses all the available storage for the filesystem.
>>>  .TP
>>>  \fB\-d\fR, \fB\-\-data \fItype\fR
>>>  Specify how the data must be spanned across the devices specified. Valid
>>> -values are raid0, raid1, raid10 or single.
> 
>    Like here?

Yes, I was thinking of a section which lists the allowable raid levels.

> 
>>> +values are of the form <n>C[D][<m>S[<p>P]], where <n> is the number of copies
>>> +of data, <m> is the number of stripes per copy, and <p> is the number of parity
>>> +stripes. The <m> parameter must (currently) be a literal "m", indicating that
>>> +as many stripes as possible will be used. The letter D may be added to the
>>> +number of copies, to indicate non-redundant copies (e.g. on the same device).
>>> +
>>> +The following deprecated values may also be used:
>>> +.RS 16
>>> +.P
>>> +single	1C
>>> +.P
>>> +raid0	1CmS
>>> +.P
>>> +raid1	2C
>>> +.P
>>> +dup		2CD
>>> +.P
>>> +raid10	2CmS
>>> +.P
>>> +raid5	1CmS1P
>>> +.P
>>> +raid6	1CmS2P
>>> +.RS -16
>>>  .TP
>>>  \fB\-f\fR
>>>  Force overwrite when an existing filesystem is detected on the device.
> 


-- 
gpg @keyserver.linux.it: Goffredo Baroncelli (kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 11:23 ` Martin Steigerwald
  2013-03-10 14:11   ` Goffredo Baroncelli
@ 2013-03-10 21:36   ` Hugo Mills
  2013-03-10 21:45     ` Harald Glatt
  2013-03-10 23:06   ` Diego Calleja
  2 siblings, 1 reply; 37+ messages in thread
From: Hugo Mills @ 2013-03-10 21:36 UTC (permalink / raw)
  To: Martin Steigerwald; +Cc: linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 2341 bytes --]

On Sun, Mar 10, 2013 at 12:23:56PM +0100, Martin Steigerwald wrote:
> Hi Hugo,
> 
> Am Samstag, 9. März 2013 schrieb Hugo Mills:
> >    Some time ago, and occasionally since, we've discussed altering the
> > "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
> > number of copies, m is the number of (data) devices in a stripe per copy,
> > and p is the number of parity devices in a stripe.
> > 
> >    The current kernel implementation uses as many devices as it can in
> > the striped modes (RAID-0, -10, -5, -6), and in this implementation,
> > that is written as "mS" (with a literal "m"). The mS and pP sections are
> > omitted if the value is 1S or 0P.
> > 
> >    The magic look-up table for old-style / new-style is:
> > 
> > single           1C (or omitted, in btrfs fi df output)
> > RAID-0           1CmS
> > RAID-1           2C
> > DUP                      2CD
> 
> What does the "D" in "2CD" mean? Its not explained above, unless I miss 
> something.

   Oh, sorry. It's "reduced redundancy", aka DUP -- i.e. you get that
number of copies, but no guarantee that the copies all live on
different devices. I'm not devoted to showing it this way. Other
suggestions for making this distinction are welcomed. :)
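
   One way to picture it (a made-up sketch, not how the kernel or the
tools actually represent profiles): the D is just a flag next to the
copy count, saying whether the copies are guaranteed to land on
different devices.

    /* Made-up illustration only: DUP vs RAID-1 differs solely in whether
     * the copies are guaranteed to be on distinct devices. */
    #include <stdio.h>

    struct repl {
        int copies;            /* the nC part */
        int distinct_devices;  /* 0 => the "D" modifier (reduced redundancy) */
    };

    int main(void)
    {
        struct repl raid1 = { 2, 1 };  /* shown as "2C"  */
        struct repl dup   = { 2, 0 };  /* shown as "2CD" */

        printf("RAID-1: copies=%d distinct=%d\n",
               raid1.copies, raid1.distinct_devices);
        printf("DUP:    copies=%d distinct=%d\n",
               dup.copies, dup.distinct_devices);
        return 0;
    }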

> > RAID-10          2CmS
> > RAID-5           1CmS1P
> > RAID-6           1CmS2P
> 
> I think its great to clarify the RAID-level terminology, but I find the new 
> notation a bit, hmmm, cryptic.
> 
> Maybe for displaying it would be nice to show a more verbose format like
> 
> 2 copies, many stripes, 1 parity (1CmS1P)
> 
> by default and the abbreviated one in parentheses?

   The only place it gets output right now is btrfs fi df, and
something that verbose would probably get in the way.

> Any other idea to make it less cryptic?

   Not necessarily less cryptic, but using lower-case c/s/p would
probably improve readability: 1c2s1p. Possibly also using * instead of
m for the "s" setting: 2c*s. That will change the heights of the
characters, so you only really need to look at the tall ones.

   Hugo.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 65E74AC0 from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
       --- There's many a slip 'twixt wicket-keeper and gully. ---       

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 21:36   ` Hugo Mills
@ 2013-03-10 21:45     ` Harald Glatt
  2013-03-10 22:59       ` Goffredo Baroncelli
  0 siblings, 1 reply; 37+ messages in thread
From: Harald Glatt @ 2013-03-10 21:45 UTC (permalink / raw)
  To: Hugo Mills, Martin Steigerwald, linux-btrfs

On Sun, Mar 10, 2013 at 10:36 PM, Hugo Mills <hugo@carfax.org.uk> wrote:
>
>    Oh, sorry. It's "reduced redundancy", aka DUP -- i.e. you get that
> number of copies, but not guarantee that the copies all live on
> different devices. I'm not devoted to showing it this way. Other
> suggestions for making this distinction are welcomed. :)
>
>
>    Hugo.
>
> --
> === Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
>   PGP key: 65E74AC0 from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
>        --- There's many a slip 'twixt wicket-keeper and gully. ---

I've noticed through my own tests that on a single device I can
corrupt around 5% of the data completely before btrfs fails. Up to
that point both filesystem and data integrity stay at 100%.
However, the default layout for one disk seems to be the data stored
once, the metadata DUP and the system DUP too. This 5% isn't
mentioned anywhere... Is this a value that could maybe be manipulated,
and could it be introduced into a naming scheme like this? Also, where
does the 5% redundancy come from?

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10  5:41     ` Roger Binns
  2013-03-10  6:29       ` Harald Glatt
@ 2013-03-10 22:04       ` Hugo Mills
  2013-03-11  0:21         ` Roger Binns
  1 sibling, 1 reply; 37+ messages in thread
From: Hugo Mills @ 2013-03-10 22:04 UTC (permalink / raw)
  To: Roger Binns; +Cc: linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 3591 bytes --]

On Sat, Mar 09, 2013 at 09:41:50PM -0800, Roger Binns wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> On 09/03/13 17:44, Hugo Mills wrote:
> > You've got at least three independent parameters to the system in order
> > to make that choice, though, and it's a fairly fuzzy decision problem.
> > You've got:
> > 
> > - Device redundancy - Storage overhead - Performance
> 
> Overhead and performance aren't separate goals.  More accurately the goal
> is best performance given the devices available and constrained by redundancy.
> 
> If I have 1GB of unique data and 10GB of underlying space available then
> feel free to make 9 additional copies of each piece of data if that helps
> performance.  As I increase the unique data the overhead available will
> decrease, but I doubt anyone has a goal of micromanaging overhead usage.
> Why can't the filesystem just figure it out and do the best job available
> given minimal constraints?
> 
> > I definitely want to report the results in nCmSpP form, which tells you
> > what it's actually done. The internal implementation, while not 
> > expressing the full gamut of possibilities, maps directly from the 
> > internal configuration to that form, and so it should at least be an 
> > allowable input for configuration (e.g. mkfs.btrfs and the restriper).
> 
> Agreed on that for the micromanagers :-)
> 
> > If you'd like to suggest a usable set of configuration axes [say, 
> > (redundancy, overhead) ], and a set of rules for converting those
> > requirements to the internal representation, then there's no reason we
> > can't add them as well in a later set of patches.
> 
> The only constraints that matter are surviving N device failures, and data
> not lost if at least N devices are still present.  Under the hood the best
> way of meeting those can be heuristically determined, and I'd expect
> things like overhead to dynamically adjust as storage fills up or empties.

   That's really not going to work happily -- you'd have to run the
restriper in the background automatically as the device fills up.
Given that this is going to end up rewriting *all* of the data on the
FS, taking up storage bandwidth as it does it (and taking a hell of a
long time to complete), I think this is a bit of a non-starter. Oh,
and your performance will drop steadily over the months as it cranks
down through the various options.

   If you want maximum storage (with some given redundancy),
regardless of performance, then you might as well start with the
parity-based levels and just leave it at that.

   Thinking about it, specifying a (redundancy, acceptable_wastage)
pair is fairly pointless in controlling the performance levels,
because it doesn't make the space/speed trade-off explicit. In fact,
without actually doing long-term benchmarks on your hardware and
workloads, it's going to be next-to-impossible to give any concrete
figures, other than "X will be faster than Y".

   So, if you (as $sysadmin) actually care about performance, you'll
have to benchmark your own system and test the various options anyway.
If you don't care about performance and want as much storage as
possible, you'll be using mS1P or mS2P (or possibly mS3P in the
future). There's not much else a heuristic can do, without effectively
exposing all the config options to the admin, in some obfuscated form.

   Hugo.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
       --- Great oxymorons of the world, no. 6: Mature Student ---       

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 22:59       ` Goffredo Baroncelli
@ 2013-03-10 22:06         ` Harald Glatt
  0 siblings, 0 replies; 37+ messages in thread
From: Harald Glatt @ 2013-03-10 22:06 UTC (permalink / raw)
  To: kreijack; +Cc: Hugo Mills, Martin Steigerwald, linux-btrfs

I created a btrfs volume on a 4GB drive using the entire drive
(VirtualBox VM). Of this drive btrfs immediately used 400 MB. I then
filled it up with random data, left around 300 MB free and made an
md5sum of said data. Then I unmounted the volume and wrote random data
into the drive with dd at 1GB, 2GB and 3GB offsets, increasing the
amount of data written each time between my tests. Btrfs kept the
entire filesystem intact (I did have to use btrfs-zero-log though)
and the checksum stayed correct the entire time. Writing 120 MB of
corrupt data into the drive worked; on my next test, writing
150 MB of corrupt data resulted in btrfs not being mountable anymore.
150 MB is around 5% of the 3.3 GB of test data, which is how I got to
the 5% figure. When I couldn't mount anymore, the output of btrfs
said something about seeing a corruption larger than 145 MB which it
cannot recover from (heavily paraphrased), so I knew that my 150 MB
test was very close to the limit.
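
(For what it's worth, the corruption step can be reproduced with a few
lines of C instead of dd; this is only a sketch of the method, and the
device path, offset and length below are made-up examples. Only ever
run something like this against a throwaway test device.)

    /* Sketch of the corruption pass described above: overwrite part of an
     * unmounted test device with pseudo-random bytes at a fixed offset.
     * The device path, offset and length are made-up examples. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
        const char *dev = "/dev/sdX";            /* throwaway test device */
        off_t offset = 1024LL * 1024 * 1024;     /* 1 GB into the device */
        size_t len = 120 * 1024 * 1024;          /* 120 MB of garbage */
        char *buf = malloc(len);
        int fd = open(dev, O_WRONLY);
        size_t i;

        if (!buf || fd < 0) {
            perror("setup");
            return 1;
        }
        for (i = 0; i < len; i++)
            buf[i] = rand();
        if (pwrite(fd, buf, len, offset) != (ssize_t)len)
            perror("pwrite");
        close(fd);
        free(buf);
        return 0;
    }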

On Sun, Mar 10, 2013 at 11:59 PM, Goffredo Baroncelli
<kreijack@gmail.com> wrote:
> On 03/10/2013 10:45 PM, Harald Glatt wrote:
>
>> I've noticed through my own tests that on a single device I can
>> corrupt around 5% of the data completely before btrfs fails. Up to
>> that point both filesystem as well as data integrity stays at 100%.
>> However the default layout for one disk seems to be having the data
>> once, the metadata DUP and the system DUP too.
>
> How make you the corruption ? Does btrfs return wrong data ? How is
> calculated the 5% ?
>
>
>> Having these 5% isn't
>> mentioned anywhere... Is this a value that could maybe be manipulated
>> and could it be introduced into a naming scheme like this? Also where
>> do the 5% redundancy come from?
>
> On a single device, the metadata are DUPlicated but the data have only 1
> copy.
>
> This means that if you corrupt the 1 copy of the metadata, btrfs
> survives using the other copy. Instead if you corrupt the data btrfs
> return an error.
>
>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>
>
> --
> gpg @keyserver.linux.it: Goffredo Baroncelli (kreijackATinwind.it>
> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 15:43 ` Goffredo Baroncelli
@ 2013-03-10 22:24   ` Hugo Mills
  2013-03-10 22:42     ` Harald Glatt
  2013-03-10 23:40   ` sam tygier
  1 sibling, 1 reply; 37+ messages in thread
From: Hugo Mills @ 2013-03-10 22:24 UTC (permalink / raw)
  To: kreijack; +Cc: linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 5475 bytes --]

On Sun, Mar 10, 2013 at 04:43:33PM +0100, Goffredo Baroncelli wrote:
> Hi Hugo,
> 
> On 03/09/2013 09:31 PM, Hugo Mills wrote:
> >    Some time ago, and occasionally since, we've discussed altering the
> > "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
> > number of copies, m is the number of (data) devices in a stripe per copy,
> > and p is the number of parity devices in a stripe.
> > 
> >    The current kernel implementation uses as many devices as it can in the
> > striped modes (RAID-0, -10, -5, -6), and in this implementation, that is
> > written as "mS" (with a literal "m"). The mS and pP sections are omitted
> > if the value is 1S or 0P.
> > 
> >    The magic look-up table for old-style / new-style is:
> > 
> > single 		 1C (or omitted, in btrfs fi df output)
> > RAID-0		 1CmS
> > RAID-1		 2C
> > DUP			 2CD
> > RAID-10		 2CmS
> > RAID-5		 1CmS1P
> > RAID-6		 1CmS2P
> 
> 
> Even I found a bit more rational the "nCmSpP" format, I think that this
> is a bit too complex.
> 
> As you told:
> >    Chris is definitely planning fixed values for mS (so you can ask
> > for, say, exactly 4 stripes and one parity), and values for nC greater
> > than 2. As far as I know, there aren't any plans for nC > 1 and pP > 0
> > together. I haven't got far enough into the kernel code to work out
> > whether that's simple or not to implement.
> 
> On the basis of that we should handle few cases than the full "nCDmSpP"
> format allow. So I suggest to allow the following "shorts forms":
> 
> - DUP			-> dD		(to allow more that 2 copy per
> 					 disk)
> 
> - RAID1			-> nC or *C	
> 
> - RAID0 		-> mS or *S

   So we can drop the C clause where it's 1C, in the same way we drop
the S clause when it's 1S, and P when it's 0P. I'm happy with that
much. It makes the parser a little more complicated, but it's no big
problem.
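
   For illustration only (this isn't the code in this series), the
output side could just skip the default-valued clauses:

    /* Illustrative sketch: format (copies, stripes, parity) as nCmSpP,
     * omitting clauses at their default values (1C, 1S, 0P).  A stripe
     * count of -1 stands for "as many as possible" and prints as "m".
     * Assumes buf is large enough for the result. */
    #include <stdio.h>

    static void format_level(char *buf, size_t len, int copies, int dup,
                             int stripes, int parity)
    {
        size_t off = 0;

        /* Keep the copies clause if it isn't the default, or if it would
         * otherwise be the only clause (plain "single"). */
        if (copies != 1 || (stripes == 1 && parity == 0))
            off += snprintf(buf + off, len - off, "%dC%s",
                            copies, dup ? "D" : "");
        if (stripes == -1)
            off += snprintf(buf + off, len - off, "mS");
        else if (stripes != 1)
            off += snprintf(buf + off, len - off, "%dS", stripes);
        if (parity != 0)
            snprintf(buf + off, len - off, "%dP", parity);
    }

    int main(void)
    {
        char buf[32] = "";
        format_level(buf, sizeof(buf), 1, 0, -1, 1);  /* RAID-5 */
        printf("%s\n", buf);                          /* prints "mS1P" */
        return 0;
    }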

> - RAID10		-> nCmS or *CmS or nC*s
> 
> - RAID with parity	-> mSpP or *SpP or mS*p (it is possible ?)
> 
> - single		-> 1C or 1D or 1S or "single"
> 
> 
> where d,n,m,p are integers; '*' is the literal '*' and means "how many
> possible".

   I don't particularly like this as a generic thing. The * is
ambiguous: does it mean "use as many as possible at all times" (like
the current RAID-0 implementation, for example), or does it mean "use
as many as we can right now, and don't reduce the value"?

   The former is the default for stripes on current RAID-0, -10, -5
and -6. However, it would be problematic for copies on RAID-1, and
-10, because you'd have variable redundancy. For specifying copies,
you'd want the second meaning above, so the * actually means subtly
different things depending on where it appears.

> For example if I have 6 disks:
> *C2S	-> means: 3 copies, 2 stripes ( m*n = 3*2 == num disk == 6)
> 2C*S	-> means: 2 copies, 3 stripes ( m*n = 3*2 == num disk == 6)
> 
> *S2P	-> means: 2 parity, 4 stripes ( p+m = 4+2 == num disk == 6)

> We could also have some defaults, like: d=2, n=2, m=*, p=1, so some
> common forms become:
> D	-> means DUP (d==2)
> S	-> means RAID0 (s=*)
> C	-> means RAID1 (n=2)
> CS	-> means RAID10 (n=2, m=*)
> SP	-> means RAID5 (m=*, p=1)
> SP2	-> means RAID6 (m=*, p=2)

   Too cryptic; special cases -- we'd only be replacing one set of
semantic-free symbols (RAID-18-OMGBBQ) with another set, which is what
I'm trying to get away from with this patch.

   I definitely want to see <number><specifier> clauses only, in a
fixed order and with as little special-casing as possible. We can drop
"unused" clauses, but trying to remove just the numbers for specific
cases, or just to produce new special names and aliases for things
seems counterproductive.

> It would allowable also more complex form like
> 	5C3D4S2P
> but I don't think it will be supported ever.

   Probably not. :)

   It's unclear to me what the 5C3D would actually do. (5 copies on
different drives, and three other copies placed somewhere on the same
devices? And as you say, unlikely to be implemented) I think the "D"
concept should stay as a modifier flag, not a separate clause, as
we've got either n copies all on different drives, or n copies without
the guarantee, which is what the D shows.

   (As an aside, when Chris gives us 3C, it'll be interesting to see
what can be done with 3CD on two drives... TRIPE rather than DUP?)

> I have a request: could we consider to swap letters with the numbers ?
> For example 3S2P could become S3P2. For my *subjective* point of view,
> this seems more clear. What do you think ?

   I read it as "three stripes, two parity", so to me it makes more
sense in <number><letter> order.

> However I am open also to the following forms
> 
> dD -> DUP(d), as backward compatibility DUP means d==2
> nC -> RAID1(n), as backward compatibility RAID1 means n==2
> mS -> RAID0(n), as backward compatibility RAID0 means m==*
> nCmS -> RAID10(n,m), as backward compatibility RAID10 means n=2, m=*
> mSpP -> RAIDP(n,m), as backward compatibility RAID6 means p=2, m=*
> 			RAID5 means p=1, m=*
> 
> more verbose but also more familiar to the administrator.

   I don't particularly want to encourage the re-use or extension of
the RAID-n notation, because what we're doing here is, largely, *not*
the standardised RAID levels.

   Hugo.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
       --- Great oxymorons of the world, no. 6: Mature Student ---       

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 22:24   ` Hugo Mills
@ 2013-03-10 22:42     ` Harald Glatt
  0 siblings, 0 replies; 37+ messages in thread
From: Harald Glatt @ 2013-03-10 22:42 UTC (permalink / raw)
  To: Hugo Mills, kreijack, linux-btrfs

On Sun, Mar 10, 2013 at 11:24 PM, Hugo Mills <hugo@carfax.org.uk> wrote:
>> On 03/09/2013 09:31 PM, Hugo Mills wrote:
>> >    Some time ago, and occasionally since, we've discussed altering the
>> > "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
>> > number of copies, m is the number of (data) devices in a stripe per copy,
>> > and p is the number of parity devices in a stripe.
>> >
>
>    Hugo.
>
> --
> === Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
>   PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
>        --- Great oxymorons of the world, no. 6: Mature Student ---

It's important that the userland tools output things in both
a 'human readable' format and the short forms. I would say that when
giving btrfs a parameter, it should only accept the short forms. But
in order to allow users to figure out what they actually mean when
doing something like 'btrfs fi show', it should tell you something
along the lines of '1 copy, 3 stripes, 1 parity (1C3S1P)' instead of
just the short form.

I would also make the case that leaving out 'defaults' from the output
is bad for the learning curve as well. Even when it's just 1C I
wouldn't remove that from the output, but the input parser should, of
course, know what you mean when you leave 1C out, and not require it.
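
Something like this, say (a sketch of the idea only, not what the tools
actually print):

    /* Sketch only: print a human-readable description alongside the
     * short form, e.g. "1 copy, 3 stripes, 1 parity (1C3S1P)". */
    #include <stdio.h>

    static void print_verbose(int copies, int stripes, int parity)
    {
        printf("%d cop%s, %d stripe%s, %d parity (%dC%dS%dP)\n",
               copies,  copies == 1 ? "y" : "ies",
               stripes, stripes == 1 ? "" : "s",
               parity,
               copies, stripes, parity);
    }

    int main(void)
    {
        print_verbose(1, 3, 1);  /* 1 copy, 3 stripes, 1 parity (1C3S1P) */
        print_verbose(2, 4, 0);  /* 2 copies, 4 stripes, 0 parity (2C4S0P) */
        return 0;
    }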

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 21:45     ` Harald Glatt
@ 2013-03-10 22:59       ` Goffredo Baroncelli
  2013-03-10 22:06         ` Harald Glatt
  0 siblings, 1 reply; 37+ messages in thread
From: Goffredo Baroncelli @ 2013-03-10 22:59 UTC (permalink / raw)
  To: Harald Glatt; +Cc: Hugo Mills, Martin Steigerwald, linux-btrfs

On 03/10/2013 10:45 PM, Harald Glatt wrote:

> I've noticed through my own tests that on a single device I can
> corrupt around 5% of the data completely before btrfs fails. Up to
> that point both filesystem as well as data integrity stays at 100%.
> However the default layout for one disk seems to be having the data
> once, the metadata DUP and the system DUP too. 

How did you create the corruption? Does btrfs return wrong data? How
is the 5% calculated?


> Having these 5% isn't
> mentioned anywhere... Is this a value that could maybe be manipulated
> and could it be introduced into a naming scheme like this? Also where
> do the 5% redundancy come from?

On a single device, the metadata is DUPlicated but the data has only 1
copy.

This means that if you corrupt one copy of the metadata, btrfs
survives using the other copy. If instead you corrupt the data, btrfs
returns an error.


> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
gpg @keyserver.linux.it: Goffredo Baroncelli (kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 11:23 ` Martin Steigerwald
  2013-03-10 14:11   ` Goffredo Baroncelli
  2013-03-10 21:36   ` Hugo Mills
@ 2013-03-10 23:06   ` Diego Calleja
  2 siblings, 0 replies; 37+ messages in thread
From: Diego Calleja @ 2013-03-10 23:06 UTC (permalink / raw)
  To: Martin Steigerwald; +Cc: linux-btrfs, Hugo Mills

On Sunday, 10 March 2013 at 12:23:56, Martin Steigerwald wrote:
> Any other idea to make it less cryptic?

I would vote for optionally allowing the codes to be expanded into
something more verbose and self-documenting, i.e.:

1CmS1P <-> 1Copy-manyStripes-1Parity

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 15:43 ` Goffredo Baroncelli
  2013-03-10 22:24   ` Hugo Mills
@ 2013-03-10 23:40   ` sam tygier
  2013-03-10 23:49     ` Hugo Mills
  1 sibling, 1 reply; 37+ messages in thread
From: sam tygier @ 2013-03-10 23:40 UTC (permalink / raw)
  To: linux-btrfs

On 10/03/13 15:43, Goffredo Baroncelli wrote:
> - DUP			-> dD		(to allow more that 2 copy per
> 					 disk)
> 
> - RAID1			-> nC or *C	
> 
> - RAID0 		-> mS or *S
> 
> - RAID10		-> nCmS or *CmS or nC*s
> 
> - RAID with parity	-> mSpP or *SpP or mS*p (it is possible ?)
> 
> - single		-> 1C or 1D or 1S or "single"
> 
> 
> where d,n,m,p are integers; '*' is the literal '*' and means "how many
> possible".

Using an asterisk '*' in something that will be used as a command line argument risks having the shell expand it. Sticking to purely alphanumeric names would be better.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 23:40   ` sam tygier
@ 2013-03-10 23:49     ` Hugo Mills
  2013-03-11 14:14       ` David Sterba
  0 siblings, 1 reply; 37+ messages in thread
From: Hugo Mills @ 2013-03-10 23:49 UTC (permalink / raw)
  To: sam tygier; +Cc: linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 1215 bytes --]

On Sun, Mar 10, 2013 at 11:40:27PM +0000, sam tygier wrote:
> On 10/03/13 15:43, Goffredo Baroncelli wrote:
> > - DUP			-> dD		(to allow more that 2 copy per
> > 					 disk)
> > 
> > - RAID1			-> nC or *C	
> > 
> > - RAID0 		-> mS or *S
> > 
> > - RAID10		-> nCmS or *CmS or nC*s
> > 
> > - RAID with parity	-> mSpP or *SpP or mS*p (it is possible ?)
> > 
> > - single		-> 1C or 1D or 1S or "single"
> > 
> > 
> > where d,n,m,p are integers; '*' is the literal '*' and means "how many
> > possible".
> 
> Using an asterisk '*' in something will be used as a command line argument risks having the shell expand it. Sticking to pure alphanumeric names would be better.

   Yeah, David's just pointed this out on IRC. After a bit of fiddling
around with various options, I like using X.

   I'm also going to use lowercase c,s,p, because it seems to be
easier to read with the different-height characters. So we end up
with, e.g.

1c      (single)
2cXs    (RAID-10)
1cXs2p  (RAID-6)
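
   For the curious, parsing that form is straightforward; a rough,
illustrative sketch (not the actual patch code -- the names and error
handling are made up, and X means "as many as possible"):

    /* Rough illustration only (not the patch code): parse strings like
     * "1c", "2cXs", "1cXs2p".  A stripe count of -1 means "X", i.e. as
     * many stripes as possible. */
    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct repl_level {
        int copies;   /* c */
        int stripes;  /* s; -1 == "X" */
        int parity;   /* p */
        int dup;      /* non-zero if copies may share a device */
    };

    static int parse_count(const char **s, int allow_x)
    {
        if (allow_x && toupper((unsigned char)**s) == 'X') {
            (*s)++;
            return -1;
        }
        if (!isdigit((unsigned char)**s))
            return -2;  /* parse error */
        return (int)strtol(*s, (char **)s, 10);
    }

    static int parse_level(const char *s, struct repl_level *out)
    {
        out->stripes = 1;
        out->parity = 0;
        out->dup = 0;

        out->copies = parse_count(&s, 0);
        if (out->copies < 1 || tolower((unsigned char)*s++) != 'c')
            return -1;
        if (tolower((unsigned char)*s) == 'd') {
            out->dup = 1;
            s++;
        }
        if (*s) {
            out->stripes = parse_count(&s, 1);
            if (out->stripes == -2 || tolower((unsigned char)*s++) != 's')
                return -1;
        }
        if (*s) {
            out->parity = parse_count(&s, 0);
            if (out->parity < 1 || tolower((unsigned char)*s++) != 'p')
                return -1;
        }
        return *s ? -1 : 0;
    }

    int main(void)
    {
        struct repl_level r;
        if (parse_level("1cXs2p", &r) == 0)
            printf("copies=%d stripes=%d parity=%d dup=%d\n",
                   r.copies, r.stripes, r.parity, r.dup);
        return 0;
    }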

   Hugo.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 65E74AC0 from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
       --- It's against my programming to impersonate a deity! ---       

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
                   ` (9 preceding siblings ...)
  2013-03-10 15:43 ` Goffredo Baroncelli
@ 2013-03-10 23:55 ` sam tygier
  2013-03-11  8:56   ` Hugo Mills
  10 siblings, 1 reply; 37+ messages in thread
From: sam tygier @ 2013-03-10 23:55 UTC (permalink / raw)
  To: linux-btrfs

On 09/03/13 20:31, Hugo Mills wrote:
>    Some time ago, and occasionally since, we've discussed altering the
> "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
> number of copies, m is the number of (data) devices in a stripe per copy,
> and p is the number of parity devices in a stripe.
> 
>    The current kernel implementation uses as many devices as it can in the
> striped modes (RAID-0, -10, -5, -6), and in this implementation, that is
> written as "mS" (with a literal "m"). The mS and pP sections are omitted
> if the value is 1S or 0P.
> 
>    The magic look-up table for old-style / new-style is:
> 
> single 		 1C (or omitted, in btrfs fi df output)
> RAID-0		 1CmS
> RAID-1		 2C
> DUP			 2CD
> RAID-10		 2CmS
> RAID-5		 1CmS1P
> RAID-6		 1CmS2P

Are these the only valid options? Are 'sensible' new levels (eg 3C, mirrored to 3 disks, or 1CmS3P, like raid6 but with 3 parity blocks) allowed? Are any arbitrary levels allowed (some other comments in the thread suggest no)? Will there be a recommended (or supported) set?




^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 22:04       ` Hugo Mills
@ 2013-03-11  0:21         ` Roger Binns
  2013-03-27  4:27           ` Brendan Hide
  0 siblings, 1 reply; 37+ messages in thread
From: Roger Binns @ 2013-03-11  0:21 UTC (permalink / raw)
  To: linux-btrfs

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 10/03/13 15:04, Hugo Mills wrote:
> On Sat, Mar 09, 2013 at 09:41:50PM -0800, Roger Binns wrote:
>> The only constraints that matter are surviving N device failures, and
>> data not lost if at least N devices are still present.  Under the
>> hood the best way of meeting those can be heuristically determined,
>> and I'd expect things like overhead to dynamically adjust as storage
>> fills up or empties.
> 
> That's really not going to work happily -- you'd have to run the 
> restriper in the background automatically as the device fills up.

Which is the better approach: the administrator sitting there
adjusting various parameters after doing some difficult calculations,
redoing it all as data and devices increase or decrease - or a computer
with billions of bytes of memory and billions of CPU cycles per second
just figuring it out based on experience? :-)

> Given that this is going to end up rewriting *all* of the data on the 
> FS,

Why does all data have to be rewritten?  Why does every piece of data have
to have exactly the same storage parameters in terms of
non-redundancy/performance/striping options?

I can easily imagine the final implementation being informed by hot data
tracking.  There is absolutely no need for data that is rarely read to be
using the maximum striping/performance/overhead options.

There is no need to rewrite everything anyway - if a filesystem with 1GB
of data is heading towards 2GB of data then only enough readjustments need
to be made to release that additional 1GB of overhead.

I also assume that the probability of all devices being exactly the same
size and having exactly the same performance characteristics is going to
decrease.  Many will expect that they can add an SSD to the soup, and over
time add/update devices - i.e. the homogeneous case that regular RAID
implicitly assumes will become increasingly rare.

> If you want maximum storage (with some given redundancy), regardless of
> performance, then you might as well start with the parity-based levels
> and just leave it at that.

In the short term it would certainly make sense to have an online
calculator or mkfs helper where you specify the device sizes and
redundancy requirements together with how much data you have, and it then
spits out the string of numbers and letters to use for mkfs/balance.
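
A toy version of that helper might look something like this (purely
hypothetical; the rule of thumb -- use extra copies if they fit,
otherwise fall back to parity -- is just an assumption for the sake of
the example, not anything btrfs implements):

    /* Purely hypothetical helper: given the number of devices, the raw
     * capacity, the amount of data and how many device failures must be
     * survivable, print a plausible nCmSpP string.  The heuristic here
     * is an assumption for illustration only; assumes data_gb > 0. */
    #include <stdio.h>

    static void suggest(int devices, double raw_gb, double data_gb,
                        int failures)
    {
        double copies_that_fit = raw_gb / data_gb;

        if (failures == 0)
            printf("1CmS\n");                     /* plain striping */
        else if (devices > failures && copies_that_fit >= failures + 1)
            printf("%dCmS\n", failures + 1);      /* mirrored + striped */
        else if (devices > failures)
            printf("1CmS%dP\n", failures);        /* parity based */
        else
            printf("not achievable with %d devices\n", devices);
    }

    int main(void)
    {
        suggest(6, 6000.0, 1000.0, 1);  /* lots of free space -> "2CmS"   */
        suggest(6, 6000.0, 4000.0, 2);  /* tight on space     -> "1CmS2P" */
        return 0;
    }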

> Thinking about it, specifying a (redundancy, acceptable_wastage) pair
> is fairly pointless in controlling the performance levels,

I don't think there is merit in specifying acceptable wastage - the answer
is obvious in that any unused space is acceptable for use.  That also
means it changes over time as storage is used/freed.

> There's not much else a heuristic can do, without effectively exposing
> all the config options to the admin, in some obfuscated form.

There is a lot heuristics can do.  At the simplest level btrfs can monitor
device performance characteristics and use that as a first pass.  One
database that I use has an interesting approach for queries - rather than
trying to work out the single best perfect execution strategy (eg which
indices in which order) it actually tries them all out concurrently and
picks the quickest.  That is then used for future similar queries, with the
performance being monitored.  Once response times no longer match the
strategy, it tries them all again to pick a new winner.

There is no reason btrfs can't try a similar approach.  When presented
with a pile of heterogeneous storage with different sizes and performance
characteristics, use all reasonable approaches and monitor resulting
read/write performance.  Then start biasing towards what works best.  Use
hot data tracking to determine which data would most benefit from its
approach being changed to more optimal values.

Roger
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.11 (GNU/Linux)

iEYEARECAAYFAlE9I4UACgkQmOOfHg372QTNZgCeJe7H9FDiwMq1CWWZTWE89/4O
fDsAn1s6/J1am4mxHhOYUnz/3JUZ6VJx
=/XF8
-----END PGP SIGNATURE-----


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 23:55 ` sam tygier
@ 2013-03-11  8:56   ` Hugo Mills
  0 siblings, 0 replies; 37+ messages in thread
From: Hugo Mills @ 2013-03-11  8:56 UTC (permalink / raw)
  To: sam tygier; +Cc: linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 2296 bytes --]

On Sun, Mar 10, 2013 at 11:55:10PM +0000, sam tygier wrote:
> On 09/03/13 20:31, Hugo Mills wrote:
> >    Some time ago, and occasionally since, we've discussed altering the
> > "RAID-n" terminology to change it to an "nCmSpP" format, where n is the
> > number of copies, m is the number of (data) devices in a stripe per copy,
> > and p is the number of parity devices in a stripe.
> > 
> >    The current kernel implementation uses as many devices as it can in the
> > striped modes (RAID-0, -10, -5, -6), and in this implementation, that is
> > written as "mS" (with a literal "m"). The mS and pP sections are omitted
> > if the value is 1S or 0P.
> > 
> >    The magic look-up table for old-style / new-style is:
> > 
> > single 		 1C (or omitted, in btrfs fi df output)
> > RAID-0		 1CmS
> > RAID-1		 2C
> > DUP			 2CD
> > RAID-10		 2CmS
> > RAID-5		 1CmS1P
> > RAID-6		 1CmS2P
> 
> Are these the only valid options?

   Currently, yes.

> Are 'sensible' new levels (eg 3C, mirrored to 3 disk or 1CmS3P, like
> raid6 with but with 3 parity blocks) allowed?

   Not right now, but:

 - I don't know if the forthcoming 3c code will allow arbitrary values
   or not, but Chris has promised 3c.

 - Fixed S will definitely happen for the parity-RAID levels. I'm not
   sure about the stripe-RAID levels.

 - Higher P values are mathematically possible, but (AIUI) it is
   awkward to construct efficient and effective codes (and doing so is
   a manual process). I suspect that 3p may happen, but 4p may not for
   a long time.

> Are any arbitrary levels allowed (some other comments in the thread
> suggest no)?

   Currently, no, and I don't think there are immediate plans to
generalise it, but I'd like to see that happen eventually.

> Will there be a recommended (or supported) set?

   Quite likely, even with the limited (forthcoming) set of
parameters. Using mSpP on an array larger than some particular size
is probably not going to be particularly good for performance, for
example.

   Hugo.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 65E74AC0 from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
   --- You stay in the theatre because you're afraid of having no ---    
                         money? There's irony...                         

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-10 23:49     ` Hugo Mills
@ 2013-03-11 14:14       ` David Sterba
  0 siblings, 0 replies; 37+ messages in thread
From: David Sterba @ 2013-03-11 14:14 UTC (permalink / raw)
  To: Hugo Mills, sam tygier, linux-btrfs

On Sun, Mar 10, 2013 at 11:49:53PM +0000, Hugo Mills wrote:
> > Using an asterisk '*' in something will be used as a command line argument risks having the shell expand it. Sticking to pure alphanumeric names would be better.
> 
>    Yeah, David's just pointed this out on IRC. After a bit of fiddling
> around with various options, I like using X.

I'd like to see something that can exist as an identifier or can be
copy-pasted in one click, but '*' being a shell meta-character is IMO
a stronger argument against using it.

>    I'm also going to use lowercase c,s,p, because it seems to be
> easier to read with the different-height characters. So we end up
> with, e.g.
> 
> 1c      (single)
> 2cXs    (RAID-10)
> 1cXs2p  (RAID-6)

This form looks ok to me.

david

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-11  0:21         ` Roger Binns
@ 2013-03-27  4:27           ` Brendan Hide
  2013-03-27  5:24             ` Roger Binns
  0 siblings, 1 reply; 37+ messages in thread
From: Brendan Hide @ 2013-03-27  4:27 UTC (permalink / raw)
  To: Roger Binns; +Cc: linux-btrfs

Bit late - but that could be explored in the future. The main downside I
see with "automatic" redundancy/optimisation is the complexity it
introduces. Likely this would be best served by user-space tools.

On 11/03/13 02:21, Roger Binns wrote:
> On 10/03/13 15:04, Hugo Mills wrote:
>> Given that this is going to end up rewriting *all* of the data on the
>> FS,
> Why does all data have to be rewritten?  Why does every piece of data have
> to have exactly the same storage parameters in terms of
> non-redundancy/performance/striping options?
This is a good point. You don't necessarily have to rewrite everything 
all at once so the performance penalty is not necessarily that bad. More 
importantly, some "restripe" operations actually don't need much change 
on-disk (in theory).

Let's say we have disks with 3c chunk allocation and this needs to be 
reallocated into 2c chunks.

In practice at present what would actually happen is that it would first 
create a *new* 2c chunk and migrate the data over from the 3c chunk. 
Once the data is moved across we finally mark the space taken up by the 
original 3c chunk as available for use. Rinse; Repeat.

In theory we can skip this rebalance/migration step by "thinning" out 
the chunks in-place: relabel the chunk as 2c and mark the unneeded 
copies as available diskspace.

A similar situation applies to other types of "conversions" in that they 
could be converted in-place with much less I/O or that the I/O could be 
optimised (for example sequential I/O between disks with minimal 
buffering needed vs moving data between two locations on the same disk). 
I'm sure there are other possibilities for "in-place" conversions too, 
such as moving from 4c to 2c2s or 2c to 2s.

xC -> (x-1)C
xCmS -> (x/2)C(m*2)S
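
(A trivial, hypothetical illustration of the second rule; the point is
only that the target parameters are plain arithmetic on the old ones:)

    /* Hypothetical illustration of xCmS -> (x/2)C(m*2)S: halve the copy
     * count and double the stripe width, so the devices touched per
     * block group stay roughly the same.  Assumes x is even. */
    #include <stdio.h>

    struct profile { int copies; int stripes; };

    static struct profile halve_copies(struct profile in)
    {
        struct profile out = { in.copies / 2, in.stripes * 2 };
        return out;
    }

    int main(void)
    {
        struct profile p = { 4, 3 };  /* "4C3S" */
        struct profile q = halve_copies(p);
        printf("%dC%dS -> %dC%dS\n",
               p.copies, p.stripes, q.copies, q.stripes);  /* 4C3S -> 2C6S */
        return 0;
    }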

The complexity of the different types of conversions hasn't escaped me, 
and I do see another downside as well. With the 3C->2C conversion there 
is the inevitability of "macro" fragmentation. Again, there could be 
long-term performance implications, or the effect might even be negligible.

-- 
__________
Brendan Hide
http://swiftspirit.co.za/
http://www.webafrica.co.za/?AFF1E97


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/5] [RFC] RAID-level terminology change
  2013-03-27  4:27           ` Brendan Hide
@ 2013-03-27  5:24             ` Roger Binns
  0 siblings, 0 replies; 37+ messages in thread
From: Roger Binns @ 2013-03-27  5:24 UTC (permalink / raw)
  To: linux-btrfs

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On 26/03/13 21:27, Brendan Hide wrote:

> On 11/03/13 02:21, Roger Binns wrote:
>> Why does all data have to be rewritten?  Why does every piece of data
>> have to have exactly the same storage parameters in terms of 
>> non-redundancy/performance/striping options?

> This is a good point. You don't necessarily have to rewrite everything
> all at once so the performance penalty is not necessarily that bad.
> More importantly, some "restripe" operations actually don't need much
> change on-disk (in theory).

Note that that is not what I was describing.  What I meant was that if I
put 10GB of data onto 100GB of space, then btrfs is free to go above and
beyond the minimums, and to do so differently for different pieces of
data.  For example, btrfs could make 6 copies of files beginning with
'a', 10 of files beginning with 'c' and 274 of all others.  Obviously
that is a bad heuristic, but anything it deems useful for all that unused
space is fine by me, and there is absolutely no need for every block to
have exactly the same parameters as all the others.

Roger



-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.11 (GNU/Linux)

iEYEARECAAYFAlFSgqcACgkQmOOfHg372QRIWwCgs/1ou96E5S0d93XEcAnIDvTd
f08AoNn6F4zjfQSzXAnkZk4RS4KWZq0b
=+kL4
-----END PGP SIGNATURE-----


^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2013-03-27  5:25 UTC | newest]

Thread overview: 37+ messages
2013-03-09 20:31 [PATCH 0/5] [RFC] RAID-level terminology change Hugo Mills
2013-03-09 20:31 ` [PATCH 1/5] Use nCmSpP format for mkfs Hugo Mills
2013-03-09 20:31 ` [PATCH 2/5] Move parse_profile to utils.c Hugo Mills
2013-03-09 20:31 ` [PATCH 3/5] Convert balance filter parser to use common nCmSpP replication-level parser Hugo Mills
2013-03-09 20:31 ` [PATCH 4/5] Change output of btrfs fi df to report new (or old) RAID names Hugo Mills
2013-03-09 20:31 ` [PATCH 5/5] Add man page description for nCmSpP replication levels Hugo Mills
2013-03-10 14:01   ` Goffredo Baroncelli
2013-03-10 17:20     ` Hugo Mills
2013-03-10 17:52       ` Goffredo Baroncelli
2013-03-09 21:38 ` [PATCH 0/5] [RFC] RAID-level terminology change Harald Glatt
     [not found] ` <CAFWF=am4ki529Zez4123gYk3BD+Z9RONRpAK7NZe=skHzcdMiw@mail.gmail.com>
2013-03-09 21:46   ` Hugo Mills
2013-03-09 22:25 ` Roger Binns
2013-03-10  1:44   ` Hugo Mills
2013-03-10  5:41     ` Roger Binns
2013-03-10  6:29       ` Harald Glatt
2013-03-10  6:37         ` Harald Glatt
2013-03-10 11:31           ` Martin Steigerwald
2013-03-10 11:48           ` Roger Binns
2013-03-10 22:04       ` Hugo Mills
2013-03-11  0:21         ` Roger Binns
2013-03-27  4:27           ` Brendan Hide
2013-03-27  5:24             ` Roger Binns
2013-03-10 11:23 ` Martin Steigerwald
2013-03-10 14:11   ` Goffredo Baroncelli
2013-03-10 21:36   ` Hugo Mills
2013-03-10 21:45     ` Harald Glatt
2013-03-10 22:59       ` Goffredo Baroncelli
2013-03-10 22:06         ` Harald Glatt
2013-03-10 23:06   ` Diego Calleja
2013-03-10 15:43 ` Goffredo Baroncelli
2013-03-10 22:24   ` Hugo Mills
2013-03-10 22:42     ` Harald Glatt
2013-03-10 23:40   ` sam tygier
2013-03-10 23:49     ` Hugo Mills
2013-03-11 14:14       ` David Sterba
2013-03-10 23:55 ` sam tygier
2013-03-11  8:56   ` Hugo Mills
