All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/7] mdadm support for Partial Parity Log
@ 2016-11-24 12:29 Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 1/7] imsm: metadata changes for PPL Artur Paszkiewicz
                   ` (6 more replies)
  0 siblings, 7 replies; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

This is the mdadm part of the PPL functionality. It adds a new parameter to
mdadm to allow selecting the RWH policy for an array. Other changes include
displaying the RWH policy in the output from --detail and --examine.

As with the kernel patches sent earlier, all of this is currently targeted and
tested with IMSM and native MD v1.1 and v1.2 metadata arrays.

Artur Paszkiewicz (6):
  imsm: metadata changes for PPL
  Generic support for --rwh-policy and PPL
  imsm: PPL support
  super1: PPL support
  Allow changing the RWH policy for a running array
  Man page changes for --rwh-policy

Pawel Baldysiak (1):
  imsm: allow to assemble with PPL even if dirty degraded

 Assemble.c    |   4 +-
 Create.c      |  26 ++++++++--
 Detail.c      |  18 ++++++-
 Grow.c        |  15 +++++-
 Kill.c        |   2 +-
 Manage.c      |  79 ++++++++++++++++++++++++++++
 ReadMe.c      |   1 +
 maps.c        |   7 +++
 mdadm.8.in    |  28 ++++++++++
 mdadm.c       |  44 ++++++++++++++--
 mdadm.h       |  21 ++++++--
 super-ddf.c   |   6 +--
 super-gpt.c   |   2 +-
 super-intel.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
 super-mbr.c   |   2 +-
 super0.c      |   8 +--
 super1.c      |  92 +++++++++++++++++++++++++++------
 sysfs.c       |  15 ++++++
 18 files changed, 475 insertions(+), 56 deletions(-)

-- 
2.10.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/7] imsm: metadata changes for PPL
  2016-11-24 12:29 [PATCH 0/7] mdadm support for Partial Parity Log Artur Paszkiewicz
@ 2016-11-24 12:29 ` Artur Paszkiewicz
  2016-11-28 23:27   ` Jes Sorensen
  2016-11-24 12:29 ` [PATCH 2/7] Generic support for --rwh-policy and PPL Artur Paszkiewicz
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

Updates for the IMSM metadata format, including PPL header structures.

Extend imsm_vol dirty field adding a third value, which is required to
enable PPL recovery in Windows and UEFI drivers.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 super-intel.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index 5740088..69f6201 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -104,8 +104,11 @@ struct imsm_disk {
 	__u32 status;			 /* 0xF0 - 0xF3 */
 	__u32 owner_cfg_num; /* which config 0,1,2... owns this disk */
 	__u32 total_blocks_hi;		 /* 0xF4 - 0xF5 total blocks hi */
-#define	IMSM_DISK_FILLERS	3
-	__u32 filler[IMSM_DISK_FILLERS]; /* 0xF5 - 0x107 MPB_DISK_FILLERS for future expansion */
+	__u8 is_journal_disk;
+	__u8 filler1; /* MPB_DISK_FILLERS - reserved for future expansion */
+	__u16 filler2;
+#define	IMSM_DISK_FILLERS	2
+	__u32 filler[IMSM_DISK_FILLERS];
 };
 
 /* map selector for map managment
@@ -154,6 +157,9 @@ struct imsm_vol {
 #define MIGR_STATE_CHANGE 4
 #define MIGR_REPAIR 5
 	__u8  migr_type;	/* Initializing, Rebuilding, ... */
+#define RAIDVOL_CLEAN          0
+#define RAIDVOL_DIRTY          1
+#define RAIDVOL_DSRECORD_VALID 2
 	__u8  dirty;
 	__u8  fs_state;		/* fast-sync state for CnG (0xff == disabled) */
 	__u16 verify_errors;	/* number of mismatches */
@@ -189,7 +195,30 @@ struct imsm_dev {
 	__u16 cache_policy;
 	__u8  cng_state;
 	__u8  cng_sub_state;
-#define IMSM_DEV_FILLERS 10
+	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
+
+	/* NVM_EN */
+	__u8 nv_cache_mode;
+	union {
+		__u8 nv_cache_flags;
+		struct {
+		    __u8 dirty:1; /* 1 - cache is dirty, 0 - clean */
+		    __u8 nvc_health:2;
+		    __u8 expansion_bytes:5;
+		} nvCache;
+	};
+
+	/* Unique Volume Id of the NvCache Volume associated with this volume */
+	__u32 nvc_vol_orig_family_num;
+	__u16 nvc_vol_raid_dev_num;
+
+#define RWH_OFF 0
+#define RWH_DISTRIBUTED 1
+#define RWH_JOURNALING_DRIVE 2
+	__u8  rwh_policy;              /* Raid Write Hole Policy */
+	__u8  filler1;
+
+#define IMSM_DEV_FILLERS 7
 	__u32 filler[IMSM_DEV_FILLERS];
 	struct imsm_vol vol;
 } __attribute__ ((packed));
@@ -7565,12 +7594,15 @@ mark_checkpoint:
 
 skip_mark_checkpoint:
 	/* mark dirty / clean */
-	if (dev->vol.dirty != !consistent) {
+	if ((dev->vol.dirty & RAIDVOL_DIRTY) != !consistent) {
 		dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty");
-		if (consistent)
-			dev->vol.dirty = 0;
-		else
-			dev->vol.dirty = 1;
+		if (consistent) {
+			dev->vol.dirty = RAIDVOL_CLEAN;
+		} else {
+			dev->vol.dirty = RAIDVOL_DIRTY;
+			if (dev->rwh_policy)
+				dev->vol.dirty |= RAIDVOL_DSRECORD_VALID;
+		}
 		super->updates_pending++;
 	}
 
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/7] Generic support for --rwh-policy and PPL
  2016-11-24 12:29 [PATCH 0/7] mdadm support for Partial Parity Log Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 1/7] imsm: metadata changes for PPL Artur Paszkiewicz
@ 2016-11-24 12:29 ` Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 3/7] imsm: PPL support Artur Paszkiewicz
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

Add a new parameter to mdadm: --rwh-policy=. It can be used to create a
raid5 array using PPL. Add the necessary plumbing to pass this option to
metadata handlers. The write journal functionality is treated as a
different RWH policy, which is implicitly selected when using
--write-journal.

Show the currently enabled RWH policy type in the output from
mdadm --detail. The value is retrieved from the array's sysfs attribute
'rwh_policy'.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Create.c      |  7 ++++++-
 Detail.c      | 18 ++++++++++++++++--
 Kill.c        |  2 +-
 ReadMe.c      |  1 +
 maps.c        |  7 +++++++
 mdadm.c       | 35 ++++++++++++++++++++++++++++++++---
 mdadm.h       | 15 +++++++++++++--
 super-ddf.c   |  4 ++--
 super-intel.c | 12 ++++++------
 super0.c      |  6 +++---
 super1.c      |  4 ++--
 sysfs.c       | 11 +++++++++++
 12 files changed, 100 insertions(+), 22 deletions(-)

diff --git a/Create.c b/Create.c
index 1594a39..52e7e2b 100644
--- a/Create.c
+++ b/Create.c
@@ -594,6 +594,11 @@ int Create(struct supertype *st, char *mddev,
 		return 1;
 	}
 
+	if (s->rwh_policy == RWH_POLICY_PPL && !st->ss->supports_ppl) {
+		pr_err("%s metadata does not support PPL\n", st->ss->name);
+		return 1;
+	}
+
 	/* We need to create the device */
 	map_lock(&map);
 	mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name);
@@ -720,7 +725,7 @@ int Create(struct supertype *st, char *mddev,
 				name += 2;
 		}
 	}
-	if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid,
+	if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid,
 				data_offset))
 		goto abort_locked;
 
diff --git a/Detail.c b/Detail.c
index 925e479..d79503d 100644
--- a/Detail.c
+++ b/Detail.c
@@ -504,15 +504,29 @@ int Detail(char *dev, struct context *c)
 		case 10:
 		case 6:
 			if (array.chunk_size)
-				printf("     Chunk Size : %dK\n\n",
+				printf("     Chunk Size : %dK\n",
 				       array.chunk_size/1024);
 			break;
 		case -1:
-			printf("       Rounding : %dK\n\n", array.chunk_size/1024);
+			printf("       Rounding : %dK\n", array.chunk_size/1024);
 			break;
 		default: break;
 		}
 
+		if (array.level == 4 || array.level == 5 || array.level == 6) {
+			struct mdinfo *mdi = sysfs_read(fd, NULL,
+							GET_RWH_POLICY);
+			if (mdi) {
+				char *rwh_policy = map_num(rwh_policies,
+							   mdi->rwh_policy);
+				sysfs_free(mdi);
+				if (rwh_policy)
+					printf("     RWH Policy : %s\n",
+					       rwh_policy);
+			}
+		}
+		printf("\n");
+
 		if (e && e->percent >= 0) {
 			static char *sync_action[] = {
 				"Rebuild", "Resync",
diff --git a/Kill.c b/Kill.c
index f2fdb85..ff52561 100644
--- a/Kill.c
+++ b/Kill.c
@@ -63,7 +63,7 @@ int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl)
 	rv = st->ss->load_super(st, fd, dev);
 	if (rv == 0 || (force && rv >= 2)) {
 		st->ss->free_super(st);
-		st->ss->init_super(st, NULL, 0, "", NULL, NULL,
+		st->ss->init_super(st, NULL, NULL, "", NULL, NULL,
 				   INVALID_SECTORS);
 		if (st->ss->store_super(st, fd)) {
 			if (verbose >= 0)
diff --git a/ReadMe.c b/ReadMe.c
index d3fcb61..11165a8 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -143,6 +143,7 @@ struct option long_options[] = {
     {"nodes",1, 0, Nodes}, /* also for --assemble */
     {"home-cluster",1, 0, ClusterName},
     {"write-journal",1, 0, WriteJournal},
+    {"rwh-policy",1, 0, RwhPolicy},
 
     /* For assemble */
     {"uuid",      1, 0, 'u'},
diff --git a/maps.c b/maps.c
index 64f1df2..2e8dd9e 100644
--- a/maps.c
+++ b/maps.c
@@ -129,6 +129,13 @@ mapping_t faultylayout[] = {
 	{ NULL, 0}
 };
 
+mapping_t rwh_policies[] = {
+	{ "off", RWH_POLICY_OFF},
+	{ "journal", RWH_POLICY_JOURNAL},
+	{ "ppl", RWH_POLICY_PPL},
+	{ NULL, 0}
+};
+
 char *map_num(mapping_t *map, int num)
 {
 	while (map->name) {
diff --git a/mdadm.c b/mdadm.c
index cca0933..9ecdce6 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -78,6 +78,7 @@ int main(int argc, char *argv[])
 		.level		= UnSet,
 		.layout		= UnSet,
 		.bitmap_chunk	= UnSet,
+		.rwh_policy	= UnSet,
 	};
 
 	char sys_hostname[256];
@@ -1198,6 +1199,13 @@ int main(int argc, char *argv[])
 
 			s.journaldisks = 1;
 			continue;
+		case O(CREATE, RwhPolicy):
+			s.rwh_policy = map_name(rwh_policies, optarg);
+			if (s.rwh_policy == UnSet) {
+				pr_err("Invalid RWH policy: %s\n", optarg);
+				exit(2);
+			}
+			continue;
 		}
 		/* We have now processed all the valid options. Anything else is
 		 * an error
@@ -1225,9 +1233,30 @@ int main(int argc, char *argv[])
 		exit(0);
 	}
 
-	if (s.journaldisks && (s.level < 4 || s.level > 6)) {
-		pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
-		exit(2);
+	if (s.journaldisks) {
+		if (s.level < 4 || s.level > 6) {
+			pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
+			exit(2);
+		}
+		if (s.rwh_policy == UnSet) {
+			s.rwh_policy = RWH_POLICY_JOURNAL;
+		} else if (s.rwh_policy != RWH_POLICY_JOURNAL) {
+			pr_err("--write-journal is not supported with RWH policy: %s\n",
+			       map_num(rwh_policies, s.rwh_policy));
+			exit(2);
+		}
+	}
+
+	if (s.rwh_policy != UnSet) {
+		if (s.level < 4 || s.level > 6) {
+			pr_err("--rwh-policy is only supported for RAID level 4/5/6.\n");
+			exit(2);
+		}
+		if (s.rwh_policy == RWH_POLICY_JOURNAL && !s.journaldisks) {
+			pr_err("--write-journal is required for RWH policy: %s\n",
+			       map_num(rwh_policies, s.rwh_policy));
+			exit(2);
+		}
 	}
 
 	if (!mode && devs_found) {
diff --git a/mdadm.h b/mdadm.h
index 240ab7f..4eabf59 100755
--- a/mdadm.h
+++ b/mdadm.h
@@ -268,6 +268,13 @@ struct mdinfo {
 	int journal_device_required;
 	int journal_clean;
 
+	enum {
+		RWH_POLICY_UNKNOWN,
+		RWH_POLICY_OFF,
+		RWH_POLICY_JOURNAL,
+		RWH_POLICY_PPL,
+	} rwh_policy;
+
 	/* During reshape we can sometimes change the data_offset to avoid
 	 * over-writing still-valid data.  We need to know if there is space.
 	 * So getinfo_super will fill in space_before and space_after in sectors.
@@ -409,6 +416,7 @@ enum special_options {
 	ClusterName,
 	ClusterConfirm,
 	WriteJournal,
+	RwhPolicy,
 };
 
 enum prefix_standard {
@@ -506,6 +514,7 @@ struct shape {
 	int	assume_clean;
 	int	write_behind;
 	unsigned long long size;
+	int	rwh_policy;
 };
 
 /* List of device names - wildcards expanded */
@@ -596,6 +605,7 @@ enum sysfs_read_flags {
 	GET_STATE	= (1 << 23),
 	GET_ERROR	= (1 << 24),
 	GET_ARRAY_STATE = (1 << 25),
+	GET_RWH_POLICY	= (1 << 26),
 };
 
 /* If fd >= 0, get the array it is open on,
@@ -679,7 +689,7 @@ extern int restore_stripes(int *dest, unsigned long long *offsets,
 
 extern char *map_num(mapping_t *map, int num);
 extern int map_name(mapping_t *map, char *name);
-extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[];
+extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[], rwh_policies[];
 
 extern char *map_dev_preferred(int major, int minor, int create,
 			       char *prefer);
@@ -839,7 +849,7 @@ extern struct superswitch {
 	 * metadata.
 	 */
 	int (*init_super)(struct supertype *st, mdu_array_info_t *info,
-			  unsigned long long size, char *name,
+			  struct shape *s, char *name,
 			  char *homehost, int *uuid,
 			  unsigned long long data_offset);
 
@@ -1035,6 +1045,7 @@ extern struct superswitch {
 	/* validate container after assemble */
 	int (*validate_container)(struct mdinfo *info);
 
+	int supports_ppl;
 	int swapuuid; /* true if uuid is bigending rather than hostendian */
 	int external;
 	const char *name; /* canonical metadata name */
diff --git a/super-ddf.c b/super-ddf.c
index 1707ad1..18e1e77 100644
--- a/super-ddf.c
+++ b/super-ddf.c
@@ -2290,7 +2290,7 @@ static unsigned int find_vde_by_guid(const struct ddf_super *ddf,
 
 static int init_super_ddf(struct supertype *st,
 			  mdu_array_info_t *info,
-			  unsigned long long size, char *name, char *homehost,
+			  struct shape *s, char *name, char *homehost,
 			  int *uuid, unsigned long long data_offset)
 {
 	/* This is primarily called by Create when creating a new array.
@@ -2328,7 +2328,7 @@ static int init_super_ddf(struct supertype *st,
 	struct virtual_disk *vd;
 
 	if (st->sb)
-		return init_super_ddf_bvd(st, info, size, name, homehost, uuid,
+		return init_super_ddf_bvd(st, info, s->size, name, homehost, uuid,
 					  data_offset);
 
 	if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) {
diff --git a/super-intel.c b/super-intel.c
index 69f6201..df09272 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -4887,7 +4887,7 @@ static int check_name(struct intel_super *super, char *name, int quiet)
 }
 
 static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
-				  unsigned long long size, char *name,
+				  struct shape *s, char *name,
 				  char *homehost, int *uuid,
 				  long long data_offset)
 {
@@ -4981,7 +4981,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 	strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
 	array_blocks = calc_array_size(info->level, info->raid_disks,
 					       info->layout, info->chunk_size,
-					       size * 2);
+					       s->size * 2);
 	/* round array size down to closest MB */
 	array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT;
 
@@ -4995,7 +4995,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 	vol->curr_migr_unit = 0;
 	map = get_imsm_map(dev, MAP_0);
 	set_pba_of_lba0(map, super->create_offset);
-	set_blocks_per_member(map, info_to_blocks_per_member(info, size));
+	set_blocks_per_member(map, info_to_blocks_per_member(info, s->size));
 	map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
 	map->failed_disk_num = ~0;
 	if (info->level > 0)
@@ -5023,7 +5023,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 		map->num_domains = 1;
 
 	/* info->size is only int so use the 'size' parameter instead */
-	num_data_stripes = (size * 2) / info_to_blocks_per_strip(info);
+	num_data_stripes = (s->size * 2) / info_to_blocks_per_strip(info);
 	num_data_stripes /= map->num_domains;
 	set_num_data_stripes(map, num_data_stripes);
 
@@ -5045,7 +5045,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 }
 
 static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
-			   unsigned long long size, char *name,
+		           struct shape *s, char *name,
 			   char *homehost, int *uuid,
 			   unsigned long long data_offset)
 {
@@ -5068,7 +5068,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
 	}
 
 	if (st->sb)
-		return init_super_imsm_volume(st, info, size, name, homehost, uuid,
+		return init_super_imsm_volume(st, info, s, name, homehost, uuid,
 					      data_offset);
 
 	if (info)
diff --git a/super0.c b/super0.c
index 55ebd8b..151e52a 100644
--- a/super0.c
+++ b/super0.c
@@ -721,7 +721,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
  * We use the first 8 bytes (64bits) of the sha1 of the host name
  */
 static int init_super0(struct supertype *st, mdu_array_info_t *info,
-		       unsigned long long size, char *ignored_name,
+		       struct shape *s, char *ignored_name,
 		       char *homehost, int *uuid,
 		       unsigned long long data_offset)
 {
@@ -760,8 +760,8 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
 	sb->gvalid_words = 0; /* ignored */
 	sb->ctime = time(0);
 	sb->level = info->level;
-	sb->size = size;
-	if (size != (unsigned long long)sb->size)
+	sb->size = s->size;
+	if (s->size != (unsigned long long)sb->size)
 		return 0;
 	sb->nr_disks = info->nr_disks;
 	sb->raid_disks = info->raid_disks;
diff --git a/super1.c b/super1.c
index d323439..8a98ac2 100644
--- a/super1.c
+++ b/super1.c
@@ -1388,7 +1388,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 }
 
 static int init_super1(struct supertype *st, mdu_array_info_t *info,
-		       unsigned long long size, char *name, char *homehost,
+		       struct shape *s, char *name, char *homehost,
 		       int *uuid, unsigned long long data_offset)
 {
 	struct mdp_superblock_1 *sb;
@@ -1441,7 +1441,7 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info,
 	sb->ctime = __cpu_to_le64((unsigned long long)time(0));
 	sb->level = __cpu_to_le32(info->level);
 	sb->layout = __cpu_to_le32(info->layout);
-	sb->size = __cpu_to_le64(size*2ULL);
+	sb->size = __cpu_to_le64(s->size*2ULL);
 	sb->chunksize = __cpu_to_le32(info->chunk_size>>9);
 	sb->raid_disks = __cpu_to_le32(info->raid_disks);
 
diff --git a/sysfs.c b/sysfs.c
index f8a9f0b..4772d77 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -240,6 +240,17 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options)
 	} else
 		sra->sysfs_array_state[0] = 0;
 
+	if (options & GET_RWH_POLICY) {
+		strcpy(base, "rwh_policy");
+		if (load_sys(fname, buf, sizeof(buf))) {
+			sra->rwh_policy = RWH_POLICY_UNKNOWN;
+		} else {
+			sra->rwh_policy = map_name(rwh_policies, buf);
+			if (sra->rwh_policy == UnSet)
+				sra->rwh_policy = RWH_POLICY_UNKNOWN;
+		}
+	}
+
 	if (! (options & GET_DEVS))
 		return sra;
 
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/7] imsm: PPL support
  2016-11-24 12:29 [PATCH 0/7] mdadm support for Partial Parity Log Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 1/7] imsm: metadata changes for PPL Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 2/7] Generic support for --rwh-policy and PPL Artur Paszkiewicz
@ 2016-11-24 12:29 ` Artur Paszkiewicz
  2016-11-28 23:51   ` Jes Sorensen
  2016-11-24 12:29 ` [PATCH 4/7] super1: " Artur Paszkiewicz
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

Enable creating and assembling IMSM raid5 arrays with PPL.

Write the IMSM MPB location for a device to the newly added rdev
sb_start sysfs attribute and 'journal_ppl' to 'state' attribute for
every active member.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 mdadm.h       |  1 +
 super-intel.c | 33 +++++++++++++++++++++++++++++++++
 sysfs.c       |  4 ++++
 3 files changed, 38 insertions(+)

diff --git a/mdadm.h b/mdadm.h
index 4eabf59..5600341 100755
--- a/mdadm.h
+++ b/mdadm.h
@@ -252,6 +252,7 @@ struct mdinfo {
 	unsigned long long	custom_array_size; /* size for non-default sized
 						    * arrays (in sectors)
 						    */
+	unsigned long long	sb_start;
 #define NO_RESHAPE		0
 #define VOLUME_RESHAPE		1
 #define CONTAINER_RESHAPE	2
diff --git a/super-intel.c b/super-intel.c
index df09272..79a3d78 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -1261,6 +1261,15 @@ static void print_imsm_dev(struct intel_super *super,
 	}
 	printf("\n");
 	printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
+	printf("     RWH Policy : ");
+	if (dev->rwh_policy == RWH_OFF)
+		printf("off\n");
+	else if (dev->rwh_policy == RWH_DISTRIBUTED)
+		printf("PPL distributed\n");
+	else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
+		printf("PPL journaling drive\n");
+	else
+		printf("<unknown:%d>\n", dev->rwh_policy);
 }
 
 static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
@@ -3043,6 +3052,15 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 			}
 		}
 	}
+
+	if (info->array.level == 5) {
+		if (dev->rwh_policy == RWH_OFF)
+			info->rwh_policy = RWH_POLICY_OFF;
+		else if (dev->rwh_policy == RWH_DISTRIBUTED)
+			info->rwh_policy = RWH_POLICY_PPL;
+		else
+			info->rwh_policy = RWH_POLICY_UNKNOWN;
+	}
 }
 
 static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
@@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
 
 		disk = &super->disks->disk;
 		info->data_offset = total_blocks(&super->disks->disk) - reserved;
+		/* mpb anchor sector - see store_imsm_mpb() */
+		info->sb_start = total_blocks(&super->disks->disk) -
+				 ((2 * super->sector_size) >> 9);
 		info->component_size = reserved;
 		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
 		/* we don't change info->disk.raid_disk here because
@@ -5034,6 +5055,17 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 	}
 	mpb->num_raid_devs++;
 
+	if (s->rwh_policy == UnSet || s->rwh_policy == RWH_POLICY_OFF) {
+		dev->rwh_policy = RWH_OFF;
+	} else if (s->rwh_policy == RWH_POLICY_PPL) {
+		dev->rwh_policy = RWH_DISTRIBUTED;
+	} else {
+		free(dev);
+		free(dv);
+		pr_err("imsm supports only PPL RWH Policy\n");
+		return 0;
+	}
+
 	dv->dev = dev;
 	dv->index = super->current_vol;
 	dv->next = super->devlist;
@@ -11061,6 +11093,7 @@ struct superswitch super_imsm = {
 	.container_content = container_content_imsm,
 	.validate_container = validate_container_imsm,
 
+	.supports_ppl	= 1,
 	.external	= 1,
 	.name = "imsm",
 
diff --git a/sysfs.c b/sysfs.c
index 4772d77..b4437a3 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -732,7 +732,11 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume)
 			rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
 		if (resume)
 			sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start);
+		if (sra->rwh_policy == RWH_POLICY_PPL &&
+		    (sd->recovery_start == MaxSector || !resume))
+			sysfs_set_str(sra, sd, "state", "journal_ppl");
 	}
+	sysfs_set_num(sra, sd, "sb_start", sd->sb_start);
 	return rv;
 }
 
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/7] super1: PPL support
  2016-11-24 12:29 [PATCH 0/7] mdadm support for Partial Parity Log Artur Paszkiewicz
                   ` (2 preceding siblings ...)
  2016-11-24 12:29 ` [PATCH 3/7] imsm: PPL support Artur Paszkiewicz
@ 2016-11-24 12:29 ` Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 5/7] imsm: allow to assemble with PPL even if dirty degraded Artur Paszkiewicz
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

Enable creating and assembling raid5 arrays with PPL for 1.1 and 1.2
metadata.

When creating, reserve enough space for PPL and store its size and
location in the superblock and set MD_FEATURE_PPL bit. PPL is stored in
the metadata region reserved for internal write-intent bitmap, so don't
allow using bitmap and PPL together.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Create.c      | 19 +++++++++++--
 Grow.c        | 15 +++++++++-
 mdadm.h       |  4 ++-
 super-ddf.c   |  2 +-
 super-gpt.c   |  2 +-
 super-intel.c |  4 +--
 super-mbr.c   |  2 +-
 super0.c      |  2 +-
 super1.c      | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----------
 9 files changed, 113 insertions(+), 25 deletions(-)

diff --git a/Create.c b/Create.c
index 52e7e2b..590ed69 100644
--- a/Create.c
+++ b/Create.c
@@ -201,6 +201,15 @@ int Create(struct supertype *st, char *mddev,
 		return 1;
 	}
 
+	if (s->rwh_policy == RWH_POLICY_PPL) {
+		if (s->bitmap_file) {
+			pr_err("PPL is not compatible with bitmap\n");
+			return 1;
+		} else {
+			s->bitmap_file = "none";
+		}
+	}
+
 	/* now set some defaults */
 
 	if (s->layout == UnSet) {
@@ -259,7 +268,8 @@ int Create(struct supertype *st, char *mddev,
 	if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
 					      &s->chunk, s->size*2,
 					      data_offset, NULL,
-					      &newsize, c->verbose>=0))
+					      &newsize, s->rwh_policy,
+					      c->verbose>=0))
 		return 1;
 
 	if (s->chunk && s->chunk != UnSet) {
@@ -358,7 +368,8 @@ int Create(struct supertype *st, char *mddev,
 						st, s->level, s->layout, s->raiddisks,
 						&s->chunk, s->size*2,
 						dv->data_offset, dname,
-						&freesize, c->verbose > 0)) {
+						&freesize, s->rwh_policy,
+						c->verbose > 0)) {
 				case -1: /* Not valid, message printed, and not
 					  * worth checking any further */
 					exit(2);
@@ -395,6 +406,7 @@ int Create(struct supertype *st, char *mddev,
 						       &s->chunk, s->size*2,
 						       dv->data_offset,
 						       dname, &freesize,
+						       s->rwh_policy,
 						       c->verbose >= 0)) {
 
 				pr_err("%s is not suitable for this array.\n",
@@ -501,7 +513,8 @@ int Create(struct supertype *st, char *mddev,
 						       s->raiddisks,
 						       &s->chunk, minsize*2,
 						       data_offset,
-						       NULL, NULL, 0)) {
+						       NULL, NULL,
+						       s->rwh_policy, 0)) {
 				pr_err("devices too large for RAID level %d\n", s->level);
 				return 1;
 			}
diff --git a/Grow.c b/Grow.c
index 455c5f9..8bfb09c 100755
--- a/Grow.c
+++ b/Grow.c
@@ -290,6 +290,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 	int major = BITMAP_MAJOR_HI;
 	int vers = md_get_version(fd);
 	unsigned long long bitmapsize, array_size;
+	struct mdinfo *mdi;
 
 	if (vers < 9003) {
 		major = BITMAP_MAJOR_HOSTENDIAN;
@@ -389,12 +390,23 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 		free(st);
 		return 1;
 	}
+
+	mdi = sysfs_read(fd, NULL, GET_RWH_POLICY);
+	if (mdi) {
+		if (mdi->rwh_policy == RWH_POLICY_PPL) {
+			pr_err("Cannot add bitmap to array with PPL\n");
+			free(mdi);
+			free(st);
+			return 1;
+		}
+		free(mdi);
+	}
+
 	if (strcmp(s->bitmap_file, "internal") == 0 ||
 	    strcmp(s->bitmap_file, "clustered") == 0) {
 		int rv;
 		int d;
 		int offset_setable = 0;
-		struct mdinfo *mdi;
 		if (st->ss->add_internal_bitmap == NULL) {
 			pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name);
 			return 1;
@@ -446,6 +458,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 			sysfs_init(mdi, fd, NULL);
 			rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location",
 						  mdi->bitmap_offset);
+			free(mdi);
 		} else {
 			if (strcmp(s->bitmap_file, "clustered") == 0)
 				array.state |= (1 << MD_SB_CLUSTERED);
diff --git a/mdadm.h b/mdadm.h
index 5600341..570d108 100755
--- a/mdadm.h
+++ b/mdadm.h
@@ -288,6 +288,8 @@ struct mdinfo {
 		#define MaxSector  (~0ULL) /* resync/recovery complete position */
 	};
 	long			bitmap_offset;	/* 0 == none, 1 == a file */
+	unsigned int		ppl_offset;
+	unsigned int		ppl_sectors;
 	unsigned long		safe_mode_delay; /* ms delay to mark clean */
 	int			new_level, delta_disks, new_layout, new_chunk;
 	int			errors;
@@ -948,7 +950,7 @@ extern struct superswitch {
 				 int *chunk, unsigned long long size,
 				 unsigned long long data_offset,
 				 char *subdev, unsigned long long *freesize,
-				 int verbose);
+				 int rwh_policy, int verbose);
 
 	/* Return a linked list of 'mdinfo' structures for all arrays
 	 * in the container.  For non-containers, it is like
diff --git a/super-ddf.c b/super-ddf.c
index 18e1e77..6184a73 100644
--- a/super-ddf.c
+++ b/super-ddf.c
@@ -3347,7 +3347,7 @@ static int validate_geometry_ddf(struct supertype *st,
 				 int *chunk, unsigned long long size,
 				 unsigned long long data_offset,
 				 char *dev, unsigned long long *freesize,
-				 int verbose)
+				 int rwh_policy, int verbose)
 {
 	int fd;
 	struct mdinfo *sra;
diff --git a/super-gpt.c b/super-gpt.c
index 1a2adce..efb0c00 100644
--- a/super-gpt.c
+++ b/super-gpt.c
@@ -195,7 +195,7 @@ static int validate_geometry(struct supertype *st, int level,
 			     int *chunk, unsigned long long size,
 			     unsigned long long data_offset,
 			     char *subdev, unsigned long long *freesize,
-			     int verbose)
+			     int rwh_policy, int verbose)
 {
 	pr_err("gpt metadata cannot be used this way\n");
 	return 0;
diff --git a/super-intel.c b/super-intel.c
index 79a3d78..7f12230 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -6631,7 +6631,7 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
 				  int raiddisks, int *chunk, unsigned long long size,
 				  unsigned long long data_offset,
 				  char *dev, unsigned long long *freesize,
-				  int verbose)
+				  int rwh_policy, int verbose)
 {
 	int fd, cfd;
 	struct mdinfo *sra;
@@ -10447,7 +10447,7 @@ enum imsm_reshape_type imsm_analyze_change(struct supertype *st,
 				    geo->raid_disks + devNumChange,
 				    &chunk,
 				    geo->size, INVALID_SECTORS,
-				    0, 0, 1))
+				    0, 0, info.rwh_policy, 1))
 		change = -1;
 
 	if (check_devs) {
diff --git a/super-mbr.c b/super-mbr.c
index f5e4cea..66d984c 100644
--- a/super-mbr.c
+++ b/super-mbr.c
@@ -193,7 +193,7 @@ static int validate_geometry(struct supertype *st, int level,
 			     int *chunk, unsigned long long size,
 			     unsigned long long data_offset,
 			     char *subdev, unsigned long long *freesize,
-			     int verbose)
+			     int rwh_policy, int verbose)
 {
 	pr_err("mbr metadata cannot be used this way\n");
 	return 0;
diff --git a/super0.c b/super0.c
index 151e52a..be6256f 100644
--- a/super0.c
+++ b/super0.c
@@ -1263,7 +1263,7 @@ static int validate_geometry0(struct supertype *st, int level,
 			      int *chunk, unsigned long long size,
 			      unsigned long long data_offset,
 			      char *subdev, unsigned long long *freesize,
-			      int verbose)
+			      int rwh_policy, int verbose)
 {
 	unsigned long long ldsize;
 	int fd;
diff --git a/super1.c b/super1.c
index 8a98ac2..b5825e6 100644
--- a/super1.c
+++ b/super1.c
@@ -48,10 +48,18 @@ struct mdp_superblock_1 {
 
 	__u32	chunksize;	/* in 512byte sectors */
 	__u32	raid_disks;
-	__u32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
-				 * NOTE: signed, so bitmap can be before superblock
-				 * only meaningful of feature_map[0] is set.
-				 */
+	union {
+		__u32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
+					 * NOTE: signed, so bitmap can be before superblock
+					 * only meaningful of feature_map[0] is set.
+					 */
+
+		/* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+		struct {
+			__u16 offset; /* sectors after start of superblock that ppl starts */
+			__u16 size; /* PPL size (including header) in sectors */
+		} ppl;
+	};
 
 	/* These are only valid with feature bit '4' */
 	__u32	new_level;	/* new level we are reshaping to		*/
@@ -130,6 +138,7 @@ struct misc_dev_info {
 #define	MD_FEATURE_NEW_OFFSET		64 /* new_offset must be honoured */
 #define	MD_FEATURE_BITMAP_VERSIONED	256 /* bitmap version number checked properly */
 #define	MD_FEATURE_JOURNAL		512 /* support write journal */
+#define	MD_FEATURE_PPL			1024 /* support PPL */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
@@ -139,6 +148,7 @@ struct misc_dev_info {
 					|MD_FEATURE_NEW_OFFSET		\
 					|MD_FEATURE_BITMAP_VERSIONED	\
 					|MD_FEATURE_JOURNAL		\
+					|MD_FEATURE_PPL			\
 					)
 
 #ifndef MDASSEMBLE
@@ -288,6 +298,11 @@ static int awrite(struct align_fd *afd, void *buf, int len)
 	return len;
 }
 
+static inline unsigned int choose_ppl_space(int chunk)
+{
+	return 8 + (chunk > 128*2 ? chunk : 128*2);
+}
+
 #ifndef MDASSEMBLE
 static void examine_super1(struct supertype *st, char *homehost)
 {
@@ -391,6 +406,10 @@ static void examine_super1(struct supertype *st, char *homehost)
 	if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
 		printf("Internal Bitmap : %ld sectors from superblock\n",
 		       (long)(int32_t)__le32_to_cpu(sb->bitmap_offset));
+	} else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_PPL)) {
+		printf("            PPL : %u sectors at offset %u sectors from superblock\n",
+		       __le16_to_cpu(sb->ppl.size),
+		       __le16_to_cpu(sb->ppl.offset));
 	}
 	if (sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)) {
 		printf("  Reshape pos'n : %llu%s\n", (unsigned long long)__le64_to_cpu(sb->reshape_position)/2,
@@ -932,8 +951,12 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 
 	info->data_offset = __le64_to_cpu(sb->data_offset);
 	info->component_size = __le64_to_cpu(sb->size);
-	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET))
+	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) {
 		info->bitmap_offset = (int32_t)__le32_to_cpu(sb->bitmap_offset);
+	} else if (sb->feature_map & __le32_to_cpu(MD_FEATURE_PPL)) {
+		info->ppl_offset = __le16_to_cpu(sb->ppl.offset);
+		info->ppl_sectors = __le16_to_cpu(sb->ppl.size);
+	}
 
 	info->disk.major = 0;
 	info->disk.minor = 0;
@@ -978,6 +1001,11 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 			bmend += size;
 			if (bmend > earliest)
 				earliest = bmend;
+		} else if (info->ppl_offset > 0) {
+			unsigned long long pplend = info->ppl_offset +
+						    info->ppl_sectors;
+			if (pplend > earliest)
+				earliest = pplend;
 		}
 		if (sb->bblog_offset && sb->bblog_size) {
 			unsigned long long bbend = super_offset;
@@ -1069,9 +1097,16 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 	}
 
 	info->array.working_disks = working;
-	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL))
+
+	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL)) {
 		info->journal_device_required = 1;
-	info->journal_clean = 0;
+		info->rwh_policy = RWH_POLICY_JOURNAL;
+	} else if (sb->feature_map & __le32_to_cpu(MD_FEATURE_PPL)) {
+		info->rwh_policy = RWH_POLICY_PPL;
+		info->journal_clean = 1;
+	} else {
+		info->rwh_policy = RWH_POLICY_UNKNOWN;
+	}
 }
 
 static struct mdinfo *container_content1(struct supertype *st, char *subarray)
@@ -1233,6 +1268,9 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 		if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
 			bitmap_offset = (long)__le32_to_cpu(sb->bitmap_offset);
 			bm_sectors = calc_bitmap_size(bms, 4096) >> 9;
+		} else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_PPL)) {
+			bitmap_offset = (long)__le16_to_cpu(sb->ppl.offset);
+			bm_sectors = (long)__le16_to_cpu(sb->ppl.size);
 		}
 #endif
 		if (sb_offset < data_offset) {
@@ -1462,6 +1500,9 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info,
 
 	memset(sb->dev_roles, 0xff, MAX_SB_SIZE - sizeof(struct mdp_superblock_1));
 
+	if (s->rwh_policy == RWH_POLICY_PPL)
+		sb->feature_map |= __cpu_to_le32(MD_FEATURE_PPL);
+
 	return 1;
 }
 
@@ -1663,7 +1704,7 @@ static int write_empty_r5l_meta_block(struct supertype *st, int fd)
 	crc = crc32c_le(crc, (void *)mb, META_BLOCK_SIZE);
 	mb->checksum = crc;
 
-	if (lseek64(fd, (sb->data_offset) * 512, 0) < 0LL) {
+	if (lseek64(fd, __le64_to_cpu(sb->data_offset) * 512, 0) < 0LL) {
 		pr_err("cannot seek to offset of the meta block\n");
 		goto fail_to_write;
 	}
@@ -1696,7 +1737,7 @@ static int write_init_super1(struct supertype *st)
 
 	for (di = st->info; di; di = di->next) {
 		if (di->disk.state & (1 << MD_DISK_JOURNAL))
-			sb->feature_map |= MD_FEATURE_JOURNAL;
+			sb->feature_map |= __cpu_to_le32(MD_FEATURE_JOURNAL);
 	}
 
 	for (di = st->info; di; di = di->next) {
@@ -1767,6 +1808,11 @@ static int write_init_super1(struct supertype *st)
 					(((char *)sb) + MAX_SB_SIZE);
 			bm_space = calc_bitmap_size(bms, 4096) >> 9;
 			bm_offset = (long)__le32_to_cpu(sb->bitmap_offset);
+		} else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_PPL)) {
+			bm_space = choose_ppl_space(__le32_to_cpu(sb->chunksize));
+			bm_offset = 8;
+			sb->ppl.offset = __cpu_to_le16(bm_offset);
+			sb->ppl.size = __cpu_to_le16(bm_space);
 		} else {
 			bm_space = choose_bm_space(array_size);
 			bm_offset = 8;
@@ -1838,8 +1884,10 @@ static int write_init_super1(struct supertype *st)
 				goto error_out;
 		}
 
-		if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
+		if (rv == 0 &&
+		    (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
 			rv = st->ss->write_bitmap(st, di->fd, NodeNumUpdate);
+
 		close(di->fd);
 		di->fd = -1;
 		if (rv)
@@ -2107,11 +2155,13 @@ static __u64 avail_size1(struct supertype *st, __u64 devsize,
 		return 0;
 
 #ifndef MDASSEMBLE
-	if (__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) {
+	if (__le32_to_cpu(super->feature_map) & MD_FEATURE_BITMAP_OFFSET) {
 		/* hot-add. allow for actual size of bitmap */
 		struct bitmap_super_s *bsb;
 		bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE);
 		bmspace = calc_bitmap_size(bsb, 4096) >> 9;
+	} else if (__le32_to_cpu(super->feature_map) & MD_FEATURE_PPL) {
+		bmspace = __le16_to_cpu(super->ppl.size);
 	}
 #endif
 	/* Allow space for bad block log */
@@ -2479,7 +2529,7 @@ static int validate_geometry1(struct supertype *st, int level,
 			      int *chunk, unsigned long long size,
 			      unsigned long long data_offset,
 			      char *subdev, unsigned long long *freesize,
-			      int verbose)
+			      int rwh_policy, int verbose)
 {
 	unsigned long long ldsize, devsize;
 	int bmspace;
@@ -2501,6 +2551,14 @@ static int validate_geometry1(struct supertype *st, int level,
 		/* not specified, so time to set default */
 		st->minor_version = 2;
 
+	if (st->minor_version != 2 && st->minor_version != 1 &&
+	    rwh_policy == RWH_POLICY_PPL) {
+		if (verbose)
+			pr_err("1.%d metadata does not support PPL\n",
+			       st->minor_version);
+		return 0;
+	}
+
 	fd = open(subdev, O_RDONLY|O_EXCL, 0);
 	if (fd < 0) {
 		if (verbose)
@@ -2521,8 +2579,9 @@ static int validate_geometry1(struct supertype *st, int level,
 		return 0;
 	}
 
-	/* creating:  allow suitable space for bitmap */
-	bmspace = choose_bm_space(devsize);
+	/* creating:  allow suitable space for bitmap or PPL */
+	bmspace = rwh_policy == RWH_POLICY_PPL ?
+		  choose_ppl_space((*chunk)*2) : choose_bm_space(devsize);
 
 	if (data_offset == INVALID_SECTORS)
 		data_offset = st->data_offset;
@@ -2654,5 +2713,6 @@ struct superswitch super1 = {
 #else
 	.swapuuid = 1,
 #endif
+	.supports_ppl = 1,
 	.name = "1.x",
 };
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 5/7] imsm: allow to assemble with PPL even if dirty degraded
  2016-11-24 12:29 [PATCH 0/7] mdadm support for Partial Parity Log Artur Paszkiewicz
                   ` (3 preceding siblings ...)
  2016-11-24 12:29 ` [PATCH 4/7] super1: " Artur Paszkiewicz
@ 2016-11-24 12:29 ` Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 6/7] Allow changing the RWH policy for a running array Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 7/7] Man page changes for --rwh-policy Artur Paszkiewicz
  6 siblings, 0 replies; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Pawel Baldysiak, Artur Paszkiewicz

From: Pawel Baldysiak <pawel.baldysiak@intel.com>

This is necessary to allow PPL recovery in the kernel driver. If the
recovery succeeds, the array is no longer in dirty state and can be
started normally as degraded.

Signed-off-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Assemble.c    | 4 +++-
 super-intel.c | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Assemble.c b/Assemble.c
index 3da0903..022d826 100644
--- a/Assemble.c
+++ b/Assemble.c
@@ -1943,7 +1943,9 @@ int assemble_container_content(struct supertype *st, int mdfd,
 		   content->uuid, chosen_name);
 
 	if (enough(content->array.level, content->array.raid_disks,
-		   content->array.layout, content->array.state & 1, avail) == 0) {
+		   content->array.layout,
+		   (content->array.state & 1) || content->journal_clean,
+		   avail) == 0) {
 		if (c->export && result)
 			*result |= INCR_NO;
 		else if (c->verbose >= 0) {
diff --git a/super-intel.c b/super-intel.c
index 7f12230..e6bd9ec 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -2896,6 +2896,7 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 	info->custom_array_size   <<= 32;
 	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
 	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
+	info->journal_clean = dev->rwh_policy;
 
 	if (is_gen_migration(dev)) {
 		info->reshape_active = 1;
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 6/7] Allow changing the RWH policy for a running array
  2016-11-24 12:29 [PATCH 0/7] mdadm support for Partial Parity Log Artur Paszkiewicz
                   ` (4 preceding siblings ...)
  2016-11-24 12:29 ` [PATCH 5/7] imsm: allow to assemble with PPL even if dirty degraded Artur Paszkiewicz
@ 2016-11-24 12:29 ` Artur Paszkiewicz
  2016-11-24 12:29 ` [PATCH 7/7] Man page changes for --rwh-policy Artur Paszkiewicz
  6 siblings, 0 replies; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

This extends the --rwh-policy parameter to work also in Misc mode. Using
it changes the currently active RWH policy in the kernel driver and
updates the metadata to make this change permanent. Updating metadata is
not yet implemented for super1, so this is limited to IMSM for now.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Manage.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mdadm.c       |  9 +++++++
 mdadm.h       |  1 +
 super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/Manage.c b/Manage.c
index 1b7b0c1..2343eba 100644
--- a/Manage.c
+++ b/Manage.c
@@ -1805,4 +1805,83 @@ int move_spare(char *from_devname, char *to_devname, dev_t devid)
 	close(fd2);
 	return 0;
 }
+
+int ChangeRwhPolicy(char *dev, char *update, int verbose)
+{
+	struct supertype *st;
+	struct mdinfo *info;
+	char *subarray = NULL;
+	int ret = 0;
+	int fd;
+	int new_policy = map_name(rwh_policies, update);
+
+	if (new_policy == UnSet)
+		return 1;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0)
+		return 1;
+
+	st = super_by_fd(fd, &subarray);
+	if (!st) {
+		close(fd);
+		return 1;
+	}
+
+	info = sysfs_read(fd, NULL, GET_RWH_POLICY|GET_LEVEL);
+	close(fd);
+	if (!info) {
+		ret = 1;
+		goto free_st;
+	}
+
+	if (new_policy == RWH_POLICY_PPL && !st->ss->supports_ppl) {
+		pr_err("%s metadata does not support PPL\n", st->ss->name);
+		ret = 1;
+		goto free_info;
+	}
+
+	if (info->array.level < 4 || info->array.level > 6) {
+		pr_err("Operation not supported for array level %d\n",
+				info->array.level);
+		ret = 1;
+		goto free_info;
+	}
+
+	if (info->rwh_policy != (unsigned)new_policy) {
+		if (!st->ss->external && new_policy == RWH_POLICY_PPL) {
+			pr_err("Operation supported for external metadata only.\n");
+			ret = 1;
+			goto free_info;
+		}
+
+		if (sysfs_set_str(info, NULL, "rwh_policy", update)) {
+			pr_err("Failed to change array RWH Policy\n");
+			ret = 1;
+			goto free_info;
+		}
+		info->rwh_policy = new_policy;
+	}
+
+	if (subarray) {
+		char container_dev[PATH_MAX];
+		struct mddev_ident ident;
+
+		sprintf(container_dev, "/dev/%s", st->container_devnm);
+
+		st->info = info;
+		ident.st = st;
+
+		ret = Update_subarray(container_dev, subarray, "rwh-policy",
+				&ident, verbose);
+	}
+
+free_info:
+	sysfs_free(info);
+free_st:
+	free(st);
+	free(subarray);
+
+	return ret;
+}
 #endif
diff --git a/mdadm.c b/mdadm.c
index 9ecdce6..82db22c 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -251,6 +251,7 @@ int main(int argc, char *argv[])
 		case UpdateSubarray:
 		case UdevRules:
 		case KillOpt:
+		case RwhPolicy:
 			if (!mode)
 				newmode = MISC;
 			break;
@@ -1200,11 +1201,16 @@ int main(int argc, char *argv[])
 			s.journaldisks = 1;
 			continue;
 		case O(CREATE, RwhPolicy):
+		case O(MISC, RwhPolicy):
 			s.rwh_policy = map_name(rwh_policies, optarg);
 			if (s.rwh_policy == UnSet) {
 				pr_err("Invalid RWH policy: %s\n", optarg);
 				exit(2);
 			}
+			if (mode == MISC) {
+				devmode = opt;
+				c.update = optarg;
+			}
 			continue;
 		}
 		/* We have now processed all the valid options. Anything else is
@@ -1916,6 +1922,9 @@ static int misc_list(struct mddev_dev *devlist,
 		case Action:
 			rv |= SetAction(dv->devname, c->action);
 			continue;
+		case RwhPolicy:
+			rv |= ChangeRwhPolicy(dv->devname, c->update, c->verbose);
+			continue;
 		}
 		if (dv->devname[0] == '/')
 			mdfd = open_mddev(dv->devname, 1);
diff --git a/mdadm.h b/mdadm.h
index 570d108..60a964a 100755
--- a/mdadm.h
+++ b/mdadm.h
@@ -1332,6 +1332,7 @@ extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev
 extern int Wait(char *dev);
 extern int WaitClean(char *dev, int sock, int verbose);
 extern int SetAction(char *dev, char *action);
+extern int ChangeRwhPolicy(char *dev, char *update, int verbose);
 
 extern int Incremental(struct mddev_dev *devlist, struct context *c,
 		       struct supertype *st);
diff --git a/super-intel.c b/super-intel.c
index e6bd9ec..aa44f8c 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -450,6 +450,7 @@ enum imsm_update_type {
 	update_takeover,
 	update_general_migration_checkpoint,
 	update_size_change,
+	update_rwh_policy,
 };
 
 struct imsm_update_activate_spare {
@@ -538,6 +539,12 @@ struct imsm_update_add_remove_disk {
 	enum imsm_update_type type;
 };
 
+struct imsm_update_rwh_policy {
+	enum imsm_update_type type;
+	int new_policy;
+	int dev_idx;
+};
+
 static const char *_sys_dev_type[] = {
 	[SYS_DEV_UNKNOWN] = "Unknown",
 	[SYS_DEV_SAS] = "SAS",
@@ -2896,7 +2903,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 	info->custom_array_size   <<= 32;
 	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
 	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
-	info->journal_clean = dev->rwh_policy;
 
 	if (is_gen_migration(dev)) {
 		info->reshape_active = 1;
@@ -3061,6 +3067,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 			info->rwh_policy = RWH_POLICY_PPL;
 		else
 			info->rwh_policy = RWH_POLICY_UNKNOWN;
+
+		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
 	}
 }
 
@@ -6875,6 +6883,41 @@ static int update_subarray_imsm(struct supertype *st, char *subarray,
 			}
 			super->updates_pending++;
 		}
+	} else if (strcmp(update, "rwh-policy") == 0) {
+		struct mdinfo *info;
+		int new_policy;
+		char *ep;
+		int vol = strtoul(subarray, &ep, 10);
+
+		if (!ident->st || !ident->st->info)
+			return 2;
+
+		info = ident->st->info;
+
+		if (*ep != '\0' || vol >= super->anchor->num_raid_devs)
+			return 2;
+
+		if (info->rwh_policy == RWH_POLICY_OFF)
+			new_policy = RWH_OFF;
+		else if (info->rwh_policy == RWH_POLICY_PPL)
+			new_policy = RWH_DISTRIBUTED;
+		else
+			return 2;
+
+		if (st->update_tail) {
+			struct imsm_update_rwh_policy *u = xmalloc(sizeof(*u));
+
+			u->type = update_rwh_policy;
+			u->dev_idx = vol;
+			u->new_policy = new_policy;
+			append_metadata_update(st, u, sizeof(*u));
+		} else {
+			struct imsm_dev *dev;
+
+			dev = get_imsm_dev(super, vol);
+			dev->rwh_policy = new_policy;
+			super->updates_pending++;
+		}
 	} else
 		return 2;
 
@@ -9029,6 +9072,21 @@ static void imsm_process_update(struct supertype *st,
 		}
 		break;
 	}
+	case update_rwh_policy: {
+		struct imsm_update_rwh_policy *u = (void *)update->buf;
+		int target = u->dev_idx;
+		struct imsm_dev *dev = get_imsm_dev(super, target);
+		if (!dev) {
+			dprintf("could not find subarray-%d\n", target);
+			break;
+		}
+
+		if (dev->rwh_policy != u->new_policy) {
+			dev->rwh_policy = u->new_policy;
+			super->updates_pending++;
+		}
+		break;
+	}
 	default:
 		pr_err("error: unsuported process update type:(type: %d)\n",	type);
 	}
@@ -9270,6 +9328,11 @@ static int imsm_prepare_update(struct supertype *st,
 	case update_add_remove_disk:
 		/* no update->len needed */
 		break;
+	case update_rwh_policy: {
+		if (update->len < (int)sizeof(struct imsm_update_rwh_policy))
+			return 0;
+		break;
+	}
 	default:
 		return 0;
 	}
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 7/7] Man page changes for --rwh-policy
  2016-11-24 12:29 [PATCH 0/7] mdadm support for Partial Parity Log Artur Paszkiewicz
                   ` (5 preceding siblings ...)
  2016-11-24 12:29 ` [PATCH 6/7] Allow changing the RWH policy for a running array Artur Paszkiewicz
@ 2016-11-24 12:29 ` Artur Paszkiewicz
  6 siblings, 0 replies; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

Describe the usage of the --rwh-policy parameter in Create and Misc
modes.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 mdadm.8.in | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/mdadm.8.in b/mdadm.8.in
index 3c0c58f..9295dcb 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -996,6 +996,26 @@ simultaneously. If not specified, this defaults to 4.
 Specify journal device for the RAID-4/5/6 array. The journal device
 should be a SSD with reasonable lifetime.
 
+.TP
+.BR \-\-rwh-policy=
+Specify the RAID Write Hole policy for a RAID-4/5/6 array. Currently supported
+options are
+.BR off ,
+.B journal
+and
+.BR ppl .
+
+The
+.B journal
+policy is implicitly selected when using
+.BR \-\-write-journal .
+
+The
+.B ppl
+policy (Partial Parity Log) is a mechanism that can be used with RAID5 arrays.
+This feature prevents data loss by keeping parity consistent with data even in
+case of drive failure during dirty shutdown. PPL is stored in the metadata
+region of RAID member drives, no additional journal drive is needed.
 
 .SH For assemble:
 
@@ -1675,6 +1695,14 @@ can be found it
 under
 .BR "SCRUBBING AND MISMATCHES" .
 
+.TP
+.BR \-\-rwh-policy=
+Change the RAID Write Hole policy for a RAID-4/5/6 array at runtime. For
+details about the RWH policies, see the description for the same parameter
+under
+.B Create mode
+options.
+
 .SH For Incremental Assembly mode:
 .TP
 .BR \-\-rebuild\-map ", " \-r
-- 
2.10.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/7] imsm: metadata changes for PPL
  2016-11-24 12:29 ` [PATCH 1/7] imsm: metadata changes for PPL Artur Paszkiewicz
@ 2016-11-28 23:27   ` Jes Sorensen
  2016-11-29 10:47     ` Artur Paszkiewicz
  0 siblings, 1 reply; 17+ messages in thread
From: Jes Sorensen @ 2016-11-28 23:27 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> Updates for the IMSM metadata format, including PPL header structures.
>
> Extend imsm_vol dirty field adding a third value, which is required to
> enable PPL recovery in Windows and UEFI drivers.
>
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
> ---
>  super-intel.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 40 insertions(+), 8 deletions(-)

A couple of comments on this change:

> diff --git a/super-intel.c b/super-intel.c
> index 5740088..69f6201 100644
> --- a/super-intel.c
> +++ b/super-intel.c
> @@ -154,6 +157,9 @@ struct imsm_vol {
>  #define MIGR_STATE_CHANGE 4
>  #define MIGR_REPAIR 5
>  	__u8  migr_type;	/* Initializing, Rebuilding, ... */
> +#define RAIDVOL_CLEAN          0
> +#define RAIDVOL_DIRTY          1
> +#define RAIDVOL_DSRECORD_VALID 2
>  	__u8  dirty;
>  	__u8  fs_state;		/* fast-sync state for CnG (0xff == disabled) */
>  	__u16 verify_errors;	/* number of mismatches */
> @@ -189,7 +195,30 @@ struct imsm_dev {
>  	__u16 cache_policy;
>  	__u8  cng_state;
>  	__u8  cng_sub_state;
> -#define IMSM_DEV_FILLERS 10
> +	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
> +
> +	/* NVM_EN */
> +	__u8 nv_cache_mode;
> +	union {
> +		__u8 nv_cache_flags;
> +		struct {
> +		    __u8 dirty:1; /* 1 - cache is dirty, 0 - clean */
> +		    __u8 nvc_health:2;
> +		    __u8 expansion_bytes:5;
> +		} nvCache;
> +	};

This sets off alarm bells here - you declare a union of a u8 and a
matching bitfield, but do not handle bit endian bitfields. If someone
tried to use this on a big endian system this could get messy.

> @@ -7565,12 +7594,15 @@ mark_checkpoint:
>  
>  skip_mark_checkpoint:
>  	/* mark dirty / clean */
> -	if (dev->vol.dirty != !consistent) {
> +	if ((dev->vol.dirty & RAIDVOL_DIRTY) != !consistent) {

This part makes my head spin (to be honest, the original code did
too). I had to go write a small snipped of C to figure out what the
compiler does with !x given x > 0, since consistent can be 0, 1, and 2
here.

Basically for RAIDVOL_CLEAN and consistent = 0, and RAIDVOL_DIRTY and
consistent = 1 and 2, this statement triggers. Maybe I am just terrible
dense when it comes to reading obfuscated C, but could we change this
round to a construct that is a little easier to read?

Cheers,
Jes

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/7] imsm: PPL support
  2016-11-24 12:29 ` [PATCH 3/7] imsm: PPL support Artur Paszkiewicz
@ 2016-11-28 23:51   ` Jes Sorensen
  2016-11-29 11:02     ` Artur Paszkiewicz
  0 siblings, 1 reply; 17+ messages in thread
From: Jes Sorensen @ 2016-11-28 23:51 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> Enable creating and assembling IMSM raid5 arrays with PPL.
>
> Write the IMSM MPB location for a device to the newly added rdev
> sb_start sysfs attribute and 'journal_ppl' to 'state' attribute for
> every active member.
>
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
> ---
>  mdadm.h       |  1 +
>  super-intel.c | 33 +++++++++++++++++++++++++++++++++
>  sysfs.c       |  4 ++++
>  3 files changed, 38 insertions(+)
>
> diff --git a/mdadm.h b/mdadm.h
> index 4eabf59..5600341 100755
> --- a/mdadm.h
> +++ b/mdadm.h
> @@ -252,6 +252,7 @@ struct mdinfo {
>  	unsigned long long	custom_array_size; /* size for non-default sized
>  						    * arrays (in sectors)
>  						    */
> +	unsigned long long	sb_start;
>  #define NO_RESHAPE		0
>  #define VOLUME_RESHAPE		1
>  #define CONTAINER_RESHAPE	2
> diff --git a/super-intel.c b/super-intel.c
> index df09272..79a3d78 100644
> --- a/super-intel.c
> +++ b/super-intel.c
> @@ -1261,6 +1261,15 @@ static void print_imsm_dev(struct intel_super *super,
>  	}
>  	printf("\n");
>  	printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
> +	printf("     RWH Policy : ");
> +	if (dev->rwh_policy == RWH_OFF)
> +		printf("off\n");
> +	else if (dev->rwh_policy == RWH_DISTRIBUTED)
> +		printf("PPL distributed\n");
> +	else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
> +		printf("PPL journaling drive\n");
> +	else
> +		printf("<unknown:%d>\n", dev->rwh_policy);
>  }
>  
>  static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
> @@ -3043,6 +3052,15 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>  			}
>  		}
>  	}
> +
> +	if (info->array.level == 5) {
> +		if (dev->rwh_policy == RWH_OFF)
> +			info->rwh_policy = RWH_POLICY_OFF;
> +		else if (dev->rwh_policy == RWH_DISTRIBUTED)
> +			info->rwh_policy = RWH_POLICY_PPL;
> +		else
> +			info->rwh_policy = RWH_POLICY_UNKNOWN;
> +	}
>  }
>  
>  static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
> @@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
>  
>  		disk = &super->disks->disk;
>  		info->data_offset = total_blocks(&super->disks->disk) - reserved;
> +		/* mpb anchor sector - see store_imsm_mpb() */
> +		info->sb_start = total_blocks(&super->disks->disk) -
> +				 ((2 * super->sector_size) >> 9);
>  		info->component_size = reserved;
>  		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
>  		/* we don't change info->disk.raid_disk here because

Hi Artur,

I have been sitting staring at the above for a while, and looking at
store_imsm_mpb() it is not clear to me what is meant to happen here.

For 4k sector drives, aren't you pushing back sb_start way further than
you are for 512 byte sector drives? Ie. you are subtracting 16 sectors
for the 4k drive but only two sectors for the 512 byte sector drive?

Maybe it's because it's Monday or I lost the last of my marbles, but
could you possibly enlighten me here please?

Thanks,
Jes

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/7] imsm: metadata changes for PPL
  2016-11-28 23:27   ` Jes Sorensen
@ 2016-11-29 10:47     ` Artur Paszkiewicz
  2016-11-29 15:21       ` Jes Sorensen
  0 siblings, 1 reply; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-29 10:47 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid

On 11/29/2016 12:27 AM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> Updates for the IMSM metadata format, including PPL header structures.
>>
>> Extend imsm_vol dirty field adding a third value, which is required to
>> enable PPL recovery in Windows and UEFI drivers.
>>
>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>> ---
>>  super-intel.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 40 insertions(+), 8 deletions(-)
> 
> A couple of comments on this change:
> 
>> diff --git a/super-intel.c b/super-intel.c
>> index 5740088..69f6201 100644
>> --- a/super-intel.c
>> +++ b/super-intel.c
>> @@ -154,6 +157,9 @@ struct imsm_vol {
>>  #define MIGR_STATE_CHANGE 4
>>  #define MIGR_REPAIR 5
>>  	__u8  migr_type;	/* Initializing, Rebuilding, ... */
>> +#define RAIDVOL_CLEAN          0
>> +#define RAIDVOL_DIRTY          1
>> +#define RAIDVOL_DSRECORD_VALID 2
>>  	__u8  dirty;
>>  	__u8  fs_state;		/* fast-sync state for CnG (0xff == disabled) */
>>  	__u16 verify_errors;	/* number of mismatches */
>> @@ -189,7 +195,30 @@ struct imsm_dev {
>>  	__u16 cache_policy;
>>  	__u8  cng_state;
>>  	__u8  cng_sub_state;
>> -#define IMSM_DEV_FILLERS 10
>> +	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
>> +
>> +	/* NVM_EN */
>> +	__u8 nv_cache_mode;
>> +	union {
>> +		__u8 nv_cache_flags;
>> +		struct {
>> +		    __u8 dirty:1; /* 1 - cache is dirty, 0 - clean */
>> +		    __u8 nvc_health:2;
>> +		    __u8 expansion_bytes:5;
>> +		} nvCache;
>> +	};
> 
> This sets off alarm bells here - you declare a union of a u8 and a
> matching bitfield, but do not handle bit endian bitfields. If someone
> tried to use this on a big endian system this could get messy.

Those fields are not used in the code at all, the only thing added to
this structure that is used is 'rwh_policy'. The rest is to fill the
gaps between IMSM format in UEFI/Windows.

> 
>> @@ -7565,12 +7594,15 @@ mark_checkpoint:
>>  
>>  skip_mark_checkpoint:
>>  	/* mark dirty / clean */
>> -	if (dev->vol.dirty != !consistent) {
>> +	if ((dev->vol.dirty & RAIDVOL_DIRTY) != !consistent) {
> 
> This part makes my head spin (to be honest, the original code did
> too). I had to go write a small snipped of C to figure out what the
> compiler does with !x given x > 0, since consistent can be 0, 1, and 2
> here.
> 
> Basically for RAIDVOL_CLEAN and consistent = 0, and RAIDVOL_DIRTY and
> consistent = 1 and 2, this statement triggers. Maybe I am just terrible
> dense when it comes to reading obfuscated C, but could we change this
> round to a construct that is a little easier to read?

Yes, I admit it is confusing. Does this look better?

-       if (dev->vol.dirty != !consistent) {
+       if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) ||
+           (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) {

Thanks,
Artur


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/7] imsm: PPL support
  2016-11-28 23:51   ` Jes Sorensen
@ 2016-11-29 11:02     ` Artur Paszkiewicz
  2016-11-29 15:23       ` Jes Sorensen
  0 siblings, 1 reply; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-29 11:02 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid

On 11/29/2016 12:51 AM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> Enable creating and assembling IMSM raid5 arrays with PPL.
>>
>> Write the IMSM MPB location for a device to the newly added rdev
>> sb_start sysfs attribute and 'journal_ppl' to 'state' attribute for
>> every active member.
>>
>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>> ---
>>  mdadm.h       |  1 +
>>  super-intel.c | 33 +++++++++++++++++++++++++++++++++
>>  sysfs.c       |  4 ++++
>>  3 files changed, 38 insertions(+)
>>
>> diff --git a/mdadm.h b/mdadm.h
>> index 4eabf59..5600341 100755
>> --- a/mdadm.h
>> +++ b/mdadm.h
>> @@ -252,6 +252,7 @@ struct mdinfo {
>>  	unsigned long long	custom_array_size; /* size for non-default sized
>>  						    * arrays (in sectors)
>>  						    */
>> +	unsigned long long	sb_start;
>>  #define NO_RESHAPE		0
>>  #define VOLUME_RESHAPE		1
>>  #define CONTAINER_RESHAPE	2
>> diff --git a/super-intel.c b/super-intel.c
>> index df09272..79a3d78 100644
>> --- a/super-intel.c
>> +++ b/super-intel.c
>> @@ -1261,6 +1261,15 @@ static void print_imsm_dev(struct intel_super *super,
>>  	}
>>  	printf("\n");
>>  	printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
>> +	printf("     RWH Policy : ");
>> +	if (dev->rwh_policy == RWH_OFF)
>> +		printf("off\n");
>> +	else if (dev->rwh_policy == RWH_DISTRIBUTED)
>> +		printf("PPL distributed\n");
>> +	else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
>> +		printf("PPL journaling drive\n");
>> +	else
>> +		printf("<unknown:%d>\n", dev->rwh_policy);
>>  }
>>  
>>  static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
>> @@ -3043,6 +3052,15 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>  			}
>>  		}
>>  	}
>> +
>> +	if (info->array.level == 5) {
>> +		if (dev->rwh_policy == RWH_OFF)
>> +			info->rwh_policy = RWH_POLICY_OFF;
>> +		else if (dev->rwh_policy == RWH_DISTRIBUTED)
>> +			info->rwh_policy = RWH_POLICY_PPL;
>> +		else
>> +			info->rwh_policy = RWH_POLICY_UNKNOWN;
>> +	}
>>  }
>>  
>>  static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
>> @@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
>>  
>>  		disk = &super->disks->disk;
>>  		info->data_offset = total_blocks(&super->disks->disk) - reserved;
>> +		/* mpb anchor sector - see store_imsm_mpb() */
>> +		info->sb_start = total_blocks(&super->disks->disk) -
>> +				 ((2 * super->sector_size) >> 9);
>>  		info->component_size = reserved;
>>  		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
>>  		/* we don't change info->disk.raid_disk here because
> 
> Hi Artur,
> 
> I have been sitting staring at the above for a while, and looking at
> store_imsm_mpb() it is not clear to me what is meant to happen here.
> 
> For 4k sector drives, aren't you pushing back sb_start way further than
> you are for 512 byte sector drives? Ie. you are subtracting 16 sectors
> for the 4k drive but only two sectors for the 512 byte sector drive?
> 
> Maybe it's because it's Monday or I lost the last of my marbles, but
> could you possibly enlighten me here please?

Jes,

You read it correctly. The reason for this is that the IMSM anchor is
located in the second _logical_ sector from the end of the drive. So for
4k drives this will be 16 512-byte sectors from the end.

Thanks,
Artur


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/7] imsm: metadata changes for PPL
  2016-11-29 10:47     ` Artur Paszkiewicz
@ 2016-11-29 15:21       ` Jes Sorensen
  2016-11-30  7:34         ` Artur Paszkiewicz
  0 siblings, 1 reply; 17+ messages in thread
From: Jes Sorensen @ 2016-11-29 15:21 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 11/29/2016 12:27 AM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> @@ -189,7 +195,30 @@ struct imsm_dev {
>>>  	__u16 cache_policy;
>>>  	__u8  cng_state;
>>>  	__u8  cng_sub_state;
>>> -#define IMSM_DEV_FILLERS 10
>>> +	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
>>> +
>>> +	/* NVM_EN */
>>> +	__u8 nv_cache_mode;
>>> +	union {
>>> +		__u8 nv_cache_flags;
>>> +		struct {
>>> +		    __u8 dirty:1; /* 1 - cache is dirty, 0 - clean */
>>> +		    __u8 nvc_health:2;
>>> +		    __u8 expansion_bytes:5;
>>> +		} nvCache;
>>> +	};
>> 
>> This sets off alarm bells here - you declare a union of a u8 and a
>> matching bitfield, but do not handle bit endian bitfields. If someone
>> tried to use this on a big endian system this could get messy.
>
> Those fields are not used in the code at all, the only thing added to
> this structure that is used is 'rwh_policy'. The rest is to fill the
> gaps between IMSM format in UEFI/Windows.

Hi Artur,

I did notice the code wasn't actually used, sorry I didn't mention
that. However I would still prefer to see at least a comment in the code
indicating that this would fail on BE systems.

>>> @@ -7565,12 +7594,15 @@ mark_checkpoint:
>>>  
>>>  skip_mark_checkpoint:
>>>  	/* mark dirty / clean */
>>> -	if (dev->vol.dirty != !consistent) {
>>> +	if ((dev->vol.dirty & RAIDVOL_DIRTY) != !consistent) {
>> 
>> This part makes my head spin (to be honest, the original code did
>> too). I had to go write a small snipped of C to figure out what the
>> compiler does with !x given x > 0, since consistent can be 0, 1, and 2
>> here.
>> 
>> Basically for RAIDVOL_CLEAN and consistent = 0, and RAIDVOL_DIRTY and
>> consistent = 1 and 2, this statement triggers. Maybe I am just terrible
>> dense when it comes to reading obfuscated C, but could we change this
>> round to a construct that is a little easier to read?
>
> Yes, I admit it is confusing. Does this look better?
>
> -       if (dev->vol.dirty != !consistent) {
> +       if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) ||
> +           (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) {

This I can read without getting headache, so yes :)

Cheers,
Jes

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/7] imsm: PPL support
  2016-11-29 11:02     ` Artur Paszkiewicz
@ 2016-11-29 15:23       ` Jes Sorensen
  2016-11-30  8:07         ` Artur Paszkiewicz
  0 siblings, 1 reply; 17+ messages in thread
From: Jes Sorensen @ 2016-11-29 15:23 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 11/29/2016 12:51 AM, Jes Sorensen wrote:
>>> @@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
>>>  
>>>  		disk = &super->disks->disk;
>>>  		info->data_offset = total_blocks(&super->disks->disk) - reserved;
>>> +		/* mpb anchor sector - see store_imsm_mpb() */
>>> +		info->sb_start = total_blocks(&super->disks->disk) -
>>> +				 ((2 * super->sector_size) >> 9);
>>>  		info->component_size = reserved;
>>>  		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
>>>  		/* we don't change info->disk.raid_disk here because
>> 
>> Hi Artur,
>> 
>> I have been sitting staring at the above for a while, and looking at
>> store_imsm_mpb() it is not clear to me what is meant to happen here.
>> 
>> For 4k sector drives, aren't you pushing back sb_start way further than
>> you are for 512 byte sector drives? Ie. you are subtracting 16 sectors
>> for the 4k drive but only two sectors for the 512 byte sector drive?
>> 
>> Maybe it's because it's Monday or I lost the last of my marbles, but
>> could you possibly enlighten me here please?
>
> Jes,
>
> You read it correctly. The reason for this is that the IMSM anchor is
> located in the second _logical_ sector from the end of the drive. So for
> 4k drives this will be 16 512-byte sectors from the end.

I see, so the IMSM implementation uses 512 byte logical sectors on top
of 4k drives? Could I ask you to add a note explaining this in the code?

Thanks,
Jes

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/7] imsm: metadata changes for PPL
  2016-11-29 15:21       ` Jes Sorensen
@ 2016-11-30  7:34         ` Artur Paszkiewicz
  0 siblings, 0 replies; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-30  7:34 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid

On 11/29/2016 04:21 PM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> On 11/29/2016 12:27 AM, Jes Sorensen wrote:
>>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>>> @@ -189,7 +195,30 @@ struct imsm_dev {
>>>>  	__u16 cache_policy;
>>>>  	__u8  cng_state;
>>>>  	__u8  cng_sub_state;
>>>> -#define IMSM_DEV_FILLERS 10
>>>> +	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
>>>> +
>>>> +	/* NVM_EN */
>>>> +	__u8 nv_cache_mode;
>>>> +	union {
>>>> +		__u8 nv_cache_flags;
>>>> +		struct {
>>>> +		    __u8 dirty:1; /* 1 - cache is dirty, 0 - clean */
>>>> +		    __u8 nvc_health:2;
>>>> +		    __u8 expansion_bytes:5;
>>>> +		} nvCache;
>>>> +	};
>>>
>>> This sets off alarm bells here - you declare a union of a u8 and a
>>> matching bitfield, but do not handle bit endian bitfields. If someone
>>> tried to use this on a big endian system this could get messy.
>>
>> Those fields are not used in the code at all, the only thing added to
>> this structure that is used is 'rwh_policy'. The rest is to fill the
>> gaps between IMSM format in UEFI/Windows.
> 
> Hi Artur,
> 
> I did notice the code wasn't actually used, sorry I didn't mention
> that. However I would still prefer to see at least a comment in the code
> indicating that this would fail on BE systems.

Since I wasn't going to use those fields I had not given this much
thought, but you are right - this is definitely not portable. I think
I'll remove those bitfields in the next version.

Thanks,
Artur

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/7] imsm: PPL support
  2016-11-29 15:23       ` Jes Sorensen
@ 2016-11-30  8:07         ` Artur Paszkiewicz
  2016-11-30 15:40           ` Jes Sorensen
  0 siblings, 1 reply; 17+ messages in thread
From: Artur Paszkiewicz @ 2016-11-30  8:07 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid

On 11/29/2016 04:23 PM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> On 11/29/2016 12:51 AM, Jes Sorensen wrote:
>>>> @@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
>>>>  
>>>>  		disk = &super->disks->disk;
>>>>  		info->data_offset = total_blocks(&super->disks->disk) - reserved;
>>>> +		/* mpb anchor sector - see store_imsm_mpb() */
>>>> +		info->sb_start = total_blocks(&super->disks->disk) -
>>>> +				 ((2 * super->sector_size) >> 9);
>>>>  		info->component_size = reserved;
>>>>  		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
>>>>  		/* we don't change info->disk.raid_disk here because
>>>
>>> Hi Artur,
>>>
>>> I have been sitting staring at the above for a while, and looking at
>>> store_imsm_mpb() it is not clear to me what is meant to happen here.
>>>
>>> For 4k sector drives, aren't you pushing back sb_start way further than
>>> you are for 512 byte sector drives? Ie. you are subtracting 16 sectors
>>> for the 4k drive but only two sectors for the 512 byte sector drive?
>>>
>>> Maybe it's because it's Monday or I lost the last of my marbles, but
>>> could you possibly enlighten me here please?
>>
>> Jes,
>>
>> You read it correctly. The reason for this is that the IMSM anchor is
>> located in the second _logical_ sector from the end of the drive. So for
>> 4k drives this will be 16 512-byte sectors from the end.
> 
> I see, so the IMSM implementation uses 512 byte logical sectors on top
> of 4k drives? Could I ask you to add a note explaining this in the code?

IMSM uses logical (4k or 512b) sector sizes in the metadata, but mdadm
implementation uses just 512 byte sectors. This is how it works since
Pawel's 4k support patches - it converts 4k metadata internally to 512b
sector values. Plus here the sb_start value is passed to the kernel, so
it must be in 512 byte sectors. Sure, I can add a comment about this.

Thanks,
Artur

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/7] imsm: PPL support
  2016-11-30  8:07         ` Artur Paszkiewicz
@ 2016-11-30 15:40           ` Jes Sorensen
  0 siblings, 0 replies; 17+ messages in thread
From: Jes Sorensen @ 2016-11-30 15:40 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 11/29/2016 04:23 PM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> On 11/29/2016 12:51 AM, Jes Sorensen wrote:
>>>>> @@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct
>>>>> supertype *st, struct mdinfo *info, char *
>>>>>  
>>>>>  		disk = &super->disks->disk;
>>>>>  		info->data_offset = total_blocks(&super->disks->disk)
>>>>> - reserved;
>>>>> +		/* mpb anchor sector - see store_imsm_mpb() */
>>>>> +		info->sb_start = total_blocks(&super->disks->disk) -
>>>>> +				 ((2 * super->sector_size) >> 9);
>>>>>  		info->component_size = reserved;
>>>>>  		info->disk.state = is_configured(disk) ? (1 <<
>>>>> MD_DISK_ACTIVE) : 0;
>>>>>  		/* we don't change info->disk.raid_disk here because
>>>>
>>>> Hi Artur,
>>>>
>>>> I have been sitting staring at the above for a while, and looking at
>>>> store_imsm_mpb() it is not clear to me what is meant to happen here.
>>>>
>>>> For 4k sector drives, aren't you pushing back sb_start way further than
>>>> you are for 512 byte sector drives? Ie. you are subtracting 16 sectors
>>>> for the 4k drive but only two sectors for the 512 byte sector drive?
>>>>
>>>> Maybe it's because it's Monday or I lost the last of my marbles, but
>>>> could you possibly enlighten me here please?
>>>
>>> Jes,
>>>
>>> You read it correctly. The reason for this is that the IMSM anchor is
>>> located in the second _logical_ sector from the end of the drive. So for
>>> 4k drives this will be 16 512-byte sectors from the end.
>> 
>> I see, so the IMSM implementation uses 512 byte logical sectors on top
>> of 4k drives? Could I ask you to add a note explaining this in the code?
>
> IMSM uses logical (4k or 512b) sector sizes in the metadata, but mdadm
> implementation uses just 512 byte sectors. This is how it works since
> Pawel's 4k support patches - it converts 4k metadata internally to 512b
> sector values. Plus here the sb_start value is passed to the kernel, so
> it must be in 512 byte sectors. Sure, I can add a comment about this.

Great, I prefer to have it documented so nobody else tries to pull all
their hairs out trying to understand it :)

Thanks,
Jes

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2016-11-30 15:40 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-24 12:29 [PATCH 0/7] mdadm support for Partial Parity Log Artur Paszkiewicz
2016-11-24 12:29 ` [PATCH 1/7] imsm: metadata changes for PPL Artur Paszkiewicz
2016-11-28 23:27   ` Jes Sorensen
2016-11-29 10:47     ` Artur Paszkiewicz
2016-11-29 15:21       ` Jes Sorensen
2016-11-30  7:34         ` Artur Paszkiewicz
2016-11-24 12:29 ` [PATCH 2/7] Generic support for --rwh-policy and PPL Artur Paszkiewicz
2016-11-24 12:29 ` [PATCH 3/7] imsm: PPL support Artur Paszkiewicz
2016-11-28 23:51   ` Jes Sorensen
2016-11-29 11:02     ` Artur Paszkiewicz
2016-11-29 15:23       ` Jes Sorensen
2016-11-30  8:07         ` Artur Paszkiewicz
2016-11-30 15:40           ` Jes Sorensen
2016-11-24 12:29 ` [PATCH 4/7] super1: " Artur Paszkiewicz
2016-11-24 12:29 ` [PATCH 5/7] imsm: allow to assemble with PPL even if dirty degraded Artur Paszkiewicz
2016-11-24 12:29 ` [PATCH 6/7] Allow changing the RWH policy for a running array Artur Paszkiewicz
2016-11-24 12:29 ` [PATCH 7/7] Man page changes for --rwh-policy Artur Paszkiewicz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.