All of lore.kernel.org
 help / color / mirror / Atom feed
From: Logan Gunthorpe <logang@deltatee.com>
To: linux-raid@vger.kernel.org, Jes Sorensen <jes@trained-monkey.org>
Cc: Guoqing Jiang <guoqing.jiang@linux.dev>, Xiao Ni <xni@redhat.com>,
	Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>,
	Coly Li <colyli@suse.de>,
	Chaitanya Kulkarni <chaitanyak@nvidia.com>,
	Jonmichael Hands <jm@chia.net>,
	Stephen Bates <sbates@raithlin.com>,
	Martin Oliveira <Martin.Oliveira@eideticom.com>,
	David Sloan <David.Sloan@eideticom.com>,
	Logan Gunthorpe <logang@deltatee.com>
Subject: [PATCH mdadm v4 5/7] mdadm: Add --write-zeros option for Create
Date: Fri,  7 Oct 2022 14:10:35 -0600	[thread overview]
Message-ID: <20221007201037.20263-6-logang@deltatee.com> (raw)
In-Reply-To: <20221007201037.20263-1-logang@deltatee.com>

Add the --write-zeros option for Create which will send a write zeros
request to all the disks before assembling the array. After zeroing
the array, the disks will be in a known clean state and the initial
sync may be skipped.

Writing zeroes is best used when there is a hardware offload method
to zero the data. But even still, zeroing can take several minutes on
a large device. Because of this, all disks are zeroed in parallel using
their own forked process and a message is printed to the user. The main
process will proceed only after all the zeroing processes have completed
successfully.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>

fixup! mdadm: Add --write-zeros option for Create
---
 Create.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 ReadMe.c |  2 ++
 mdadm.c  |  9 ++++++
 mdadm.h  |  5 ++++
 4 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/Create.c b/Create.c
index 4acda30c5256..36d63244f9ad 100644
--- a/Create.c
+++ b/Create.c
@@ -26,6 +26,8 @@
 #include	"md_u.h"
 #include	"md_p.h"
 #include	<ctype.h>
+#include	<fcntl.h>
+#include	<sys/wait.h>
 
 static int round_size_and_verify(unsigned long long *size, int chunk)
 {
@@ -91,9 +93,77 @@ int default_layout(struct supertype *st, int level, int verbose)
 	return layout;
 }
 
+static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st,
+			       struct mddev_dev *dv)
+
+{
+	unsigned long long offset_bytes, size_bytes;
+	int ret = 0;
+	pid_t pid;
+
+	size_bytes = KIB_TO_BYTES(s->size);
+
+	/*
+	 * If size_bytes is zero, this is a zoned raid array where
+	 * each disk is of a different size and uses its full
+	 * disk. Thus zero the entire disk.
+	 */
+	if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes))
+		return -1;
+
+	if (dv->data_offset != INVALID_SECTORS)
+		offset_bytes = SEC_TO_BYTES(dv->data_offset);
+	else
+		offset_bytes = SEC_TO_BYTES(st->data_offset);
+
+	pr_info("zeroing data from %lld to %lld on: %s\n",
+		offset_bytes, size_bytes, dv->devname);
+
+	pid = fork();
+	if (pid < 0) {
+		pr_err("Could not fork to zero disks: %m\n");
+		return pid;
+	} else if (pid != 0) {
+		return pid;
+	}
+
+	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
+		      offset_bytes, size_bytes)) {
+		pr_err("zeroing %s failed: %m\n", dv->devname);
+		ret = 1;
+	}
+
+	exit(ret);
+}
+
+static int wait_for_zero_forks(int *zero_pids, int count)
+{
+	int wstatus, ret = 0, i;
+	bool waited = false;
+
+	for (i = 0; i < count; i++) {
+		if (!zero_pids[i])
+			continue;
+
+		waited = true;
+		waitpid(zero_pids[i], &wstatus, 0);
+
+		if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
+			ret = 1;
+	}
+
+	if (ret)
+		pr_err("zeroing failed!\n");
+	else if (waited)
+		pr_info("zeroing finished\n");
+
+	return ret;
+}
+
 static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
 		struct supertype *st, struct mddev_dev *dv,
-		struct mdinfo *info, int have_container, int major_num)
+		struct mdinfo *info, int have_container, int major_num,
+		int *zero_pid)
 {
 	dev_t rdev;
 	int fd;
@@ -148,6 +218,14 @@ static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
 	}
 	st->ss->getinfo_super(st, info, NULL);
 
+	if (fd >= 0 && s->write_zeroes) {
+		*zero_pid = write_zeroes_fork(fd, s, st, dv);
+		if (*zero_pid <= 0) {
+			ioctl(mdfd, STOP_ARRAY, NULL);
+			return 1;
+		}
+	}
+
 	if (have_container && c->verbose > 0)
 		pr_err("Using %s for device %d\n",
 		       map_dev(info->disk.major, info->disk.minor, 0),
@@ -224,10 +302,12 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 {
 	struct mddev_dev *moved_disk = NULL;
 	int pass, raid_disk_num, dnum;
+	int zero_pids[total_slots];
 	struct mddev_dev *dv;
 	struct mdinfo *infos;
 	int ret = 0;
 
+	memset(zero_pids, 0, sizeof(zero_pids));
 	infos = xmalloc(sizeof(*infos) * total_slots);
 	enable_fds(total_slots);
 	for (pass = 1; pass <= 2; pass++) {
@@ -261,7 +341,7 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 
 				ret = add_disk_to_super(mdfd, s, c, st, dv,
 						&infos[dnum], have_container,
-						major_num);
+						major_num, &zero_pids[dnum]);
 				if (ret)
 					goto out;
 
@@ -287,6 +367,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 		}
 
 		if (pass == 1) {
+			ret = wait_for_zero_forks(zero_pids, total_slots);
+			if (ret)
+				goto out;
+
 			ret = update_metadata(mdfd, s, st, map, info,
 					      chosen_name);
 			if (ret)
@@ -295,6 +379,8 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 	}
 
 out:
+	if (ret)
+		wait_for_zero_forks(zero_pids, total_slots);
 	free(infos);
 	return ret;
 }
diff --git a/ReadMe.c b/ReadMe.c
index 50a5e36d05fc..9424bfc3eeca 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -138,6 +138,7 @@ struct option long_options[] = {
     {"size",	  1, 0, 'z'},
     {"auto",	  1, 0, Auto}, /* also for --assemble */
     {"assume-clean",0,0, AssumeClean },
+    {"write-zeroes",0,0, WriteZeroes },
     {"metadata",  1, 0, 'e'}, /* superblock format */
     {"bitmap",	  1, 0, Bitmap},
     {"bitmap-chunk", 1, 0, BitmapChunk},
@@ -390,6 +391,7 @@ char Help_create[] =
 "  --write-journal=      : Specify journal device for RAID-4/5/6 array\n"
 "  --consistency-policy= : Specify the policy that determines how the array\n"
 "                     -k : maintains consistency in case of unexpected shutdown.\n"
+"  --write-zeroes        : Write zeroes to the disks before creating. This will bypass initial sync.\n"
 "\n"
 ;
 
diff --git a/mdadm.c b/mdadm.c
index 972adb524dfb..141838bd394f 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -602,6 +602,10 @@ int main(int argc, char *argv[])
 			s.assume_clean = 1;
 			continue;
 
+		case O(CREATE, WriteZeroes):
+			s.write_zeroes = 1;
+			continue;
+
 		case O(GROW,'n'):
 		case O(CREATE,'n'):
 		case O(BUILD,'n'): /* number of raid disks */
@@ -1306,6 +1310,11 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (s.write_zeroes && !s.assume_clean) {
+		pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n");
+		s.assume_clean = 1;
+	}
+
 	if (!mode && devs_found) {
 		mode = MISC;
 		devmode = 'Q';
diff --git a/mdadm.h b/mdadm.h
index 18c24915e94c..82e920fb523a 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -273,6 +273,9 @@ static inline void __put_unaligned32(__u32 val, void *p)
 
 #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
 
+#define KIB_TO_BYTES(x)	((x) << 10)
+#define SEC_TO_BYTES(x)	((x) << 9)
+
 extern const char Name[];
 
 struct md_bb_entry {
@@ -433,6 +436,7 @@ extern char Version[], Usage[], Help[], OptionHelp[],
  */
 enum special_options {
 	AssumeClean = 300,
+	WriteZeroes,
 	BitmapChunk,
 	WriteBehind,
 	ReAdd,
@@ -593,6 +597,7 @@ struct shape {
 	int	bitmap_chunk;
 	char	*bitmap_file;
 	int	assume_clean;
+	bool	write_zeroes;
 	int	write_behind;
 	unsigned long long size;
 	unsigned long long data_offset;
-- 
2.30.2


  parent reply	other threads:[~2022-10-07 20:11 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-07 20:10 [PATCH mdadm v4 0/7] Write Zeroes option for Creating Arrays Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 1/7] Create: goto abort_locked instead of return 1 in error path Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 2/7] Create: remove safe_mode_delay local variable Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 3/7] Create: Factor out add_disks() helpers Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 4/7] mdadm: Introduce pr_info() Logan Gunthorpe
2022-10-07 20:10 ` Logan Gunthorpe [this message]
2022-10-07 20:10 ` [PATCH mdadm v4 6/7] tests/00raid5-zero: Introduce test to exercise --write-zeros Logan Gunthorpe
2022-10-07 20:10 ` [PATCH mdadm v4 7/7] manpage: Add --write-zeroes option to manpage Logan Gunthorpe
2022-10-12  1:09 ` [PATCH mdadm v4 0/7] Write Zeroes option for Creating Arrays Xiao Ni
2022-10-12 16:59   ` Logan Gunthorpe
2022-10-13  1:33     ` Martin K. Petersen
2022-10-13  7:51       ` Xiao Ni
2022-10-26  2:41         ` Martin K. Petersen
2022-10-27  8:44           ` Xiao Ni
2022-11-16 17:11       ` Logan Gunthorpe
2022-11-03  8:14 ` Kinga Tanska

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221007201037.20263-6-logang@deltatee.com \
    --to=logang@deltatee.com \
    --cc=David.Sloan@eideticom.com \
    --cc=Martin.Oliveira@eideticom.com \
    --cc=chaitanyak@nvidia.com \
    --cc=colyli@suse.de \
    --cc=guoqing.jiang@linux.dev \
    --cc=jes@trained-monkey.org \
    --cc=jm@chia.net \
    --cc=linux-raid@vger.kernel.org \
    --cc=mariusz.tkaczyk@linux.intel.com \
    --cc=sbates@raithlin.com \
    --cc=xni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.