* [RFC][WIP][PATCH] fsck -l and lvm/dmcrypt/md/etc
@ 2016-04-08 21:40 Yuriy M. Kaminskiy
  2016-04-18 10:59 ` Karel Zak
From: Yuriy M. Kaminskiy @ 2016-04-08 21:40 UTC (permalink / raw)
  To: util-linux

[-- Attachment #1: Type: text/plain, Size: 704 bytes --]

I think this algorithm should work:

1) walk /sys/dev/block/${dev}/slaves recursively (e.g. for
lvm-over-luks-over-lvm-over-md, looking only at first-level slaves
won't work), collecting all whole-disk devices;
2) sort this list by device number (to avoid AB/BA deadlocks) and remove
duplicates;
3) acquire all the locks in that order.

There are some unavoidable flaws (e.g. if someone alters the device stack
while fsck is running), and some things could be improved, but it should
work in most cases (and when it fails, it is no worse than the current
behavior).
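
For illustration, here is a rough standalone sketch of steps 1-3 above (not
the attached patch; the helper names, the fixed-size array and the
/run/fsck/<maj>_<min>.lock naming are simplifications of mine, most error
handling is omitted, and it needs -lblkid):

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <dirent.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/file.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <blkid.h>

static int cmp_dev(const void *a, const void *b)
{
	dev_t x = *(const dev_t *) a, y = *(const dev_t *) b;
	return (x > y) - (x < y);
}

/* steps 1+2: collect the whole-disk ancestors of @dev, sorted, no dups;
 * @dev is expected to be the whole-disk device of the checked filesystem */
static size_t collect_disks(dev_t dev, dev_t *out, size_t max)
{
	size_t n = 0, i;

	out[n++] = dev;
	for (i = 0; i < n; i++) {	/* breadth-first over what we found */
		char dirpath[PATH_MAX];
		struct dirent *dp;
		DIR *dir;

		snprintf(dirpath, sizeof(dirpath), "/sys/dev/block/%u:%u/slaves",
			 major(out[i]), minor(out[i]));
		dir = opendir(dirpath);
		if (!dir)
			continue;	/* no slaves: a real (bottom) disk */
		while ((dp = readdir(dir)) != NULL && n < max) {
			char devfile[PATH_MAX + 300];
			unsigned int maj, min;
			dev_t disk;
			size_t j;
			FILE *f;
			int rc;

			if (dp->d_name[0] == '.')
				continue;
			snprintf(devfile, sizeof(devfile), "%s/%s/dev",
				 dirpath, dp->d_name);
			f = fopen(devfile, "r");
			if (!f)
				continue;
			rc = fscanf(f, "%u:%u", &maj, &min);
			fclose(f);
			if (rc != 2)
				continue;
			/* map the slave (partition, dm, md, ...) to its whole disk */
			if (blkid_devno_to_wholedisk(makedev(maj, min),
						     NULL, 0, &disk) != 0)
				continue;
			for (j = 0; j < n && out[j] != disk; j++)
				;
			if (j == n)	/* not seen yet */
				out[n++] = disk;
		}
		closedir(dir);
	}
	qsort(out, n, sizeof(*out), cmp_dev);	/* fixed locking order */
	return n;
}

/* step 3: take an exclusive flock() on one lock file per disk, in order */
static void lock_disks(const dev_t *disks, size_t n, int *fds)
{
	size_t i;

	for (i = 0; i < n; i++) {
		char lockpath[64];

		snprintf(lockpath, sizeof(lockpath), "/run/fsck/%u_%u.lock",
			 major(disks[i]), minor(disks[i]));
		fds[i] = open(lockpath, O_RDONLY | O_CREAT | O_CLOEXEC, 0644);
		if (fds[i] >= 0 && flock(fds[i], LOCK_EX) != 0) {
			close(fds[i]);
			fds[i] = -1;	/* lock failed: just proceed unlocked */
		}
	}
}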

I've tested with LVM and LUKS-over-LVM on (Debian's) 3.16 kernel, and it
seems to work as expected.

What is not covered: /dev/loop* (and I have no plans for it).

Comments? NACKs?


[-- Attachment #2: fsck--l-dm-md.patch --]
[-- Type: text/x-diff, Size: 7033 bytes --]

BEWARE: work-in-progress, and it is full of NIH.

diff --git a/disk-utils/fsck.c b/disk-utils/fsck.c
index 84d2dcc..6bd708f 100644
--- a/disk-utils/fsck.c
+++ b/disk-utils/fsck.c
@@ -100,6 +100,10 @@ struct fsck_fs_data
 			eval_device:1;
 };
 
+union disk_lock {
+	dev_t	dev;
+	int	lock;		/* flock()ed file descriptor or -1 */
+};
 /*
  * Structure to allow exit codes to be stored
  */
@@ -107,8 +111,9 @@ struct fsck_instance {
 	int	pid;
 	int	flags;		/* FLAG_{DONE|PROGRESS} */
 
-	int	lock;		/* flock()ed lockpath file descriptor or -1 */
-	char	*lockpath;	/* /run/fsck/<diskname>.lock or NULL */
+
+	union disk_lock *locks, lock;
+	size_t nlocks;
 
 	int	exit_status;
 	struct timeval start_time;
@@ -335,19 +340,25 @@ static int is_irrotational_disk(dev_t disk)
 	return rc == 1 ? !x : 0;
 }
 
+static int compare_locks(const void *a, const void *b)
+{
+	dev_t a_dev = ((const union disk_lock *)a)->dev;
+	dev_t b_dev = ((const union disk_lock *)b)->dev;
+	return (a_dev > b_dev) - (b_dev > a_dev);
+}
+
 static void lock_disk(struct fsck_instance *inst)
 {
 	dev_t disk = fs_get_disk(inst->fs, 1);
 	char *diskpath = NULL, *diskname;
-
-	inst->lock = -1;
+	union disk_lock *locks = NULL;
+	size_t nlocks = 0;
+	size_t i;
+	char path[PATH_MAX];
+	int rc;
 
 	if (!disk || is_irrotational_disk(disk))
-		goto done;
-
-	diskpath = blkid_devno_to_devname(disk);
-	if (!diskpath)
-		goto done;
+		return;
 
 	if (access(FSCK_RUNTIME_DIRNAME, F_OK) != 0) {
 		int rc = mkdir(FSCK_RUNTIME_DIRNAME,
@@ -357,64 +368,151 @@ static void lock_disk(struct fsck_instance *inst)
 		if (rc && errno != EEXIST) {
 			warn(_("cannot create directory %s"),
 					FSCK_RUNTIME_DIRNAME);
-			goto done;
+			return;
 		}
 	}
 
+	if (fs_is_stacked(inst->fs)) {
+		/* XXX this code (gracefully) ignores errors */
+		size_t allocated = 4;
+		locks = malloc(allocated*sizeof(union disk_lock));
+		locks[nlocks++].dev = disk;
+		for(i = 0; i < nlocks; i++) {
+			size_t path_end;
+			DIR *dir;
+			struct dirent *dp;
+			size_t j;
+
+			disk = locks[i].dev;
+			rc = snprintf(path, sizeof(path),
+					"/sys/dev/block/%d:%d/slaves",
+					major(disk), minor(disk));
+			if (rc < 0 || (unsigned int) rc >= sizeof(path))
+				continue;
+			path_end = rc;
+			if ((dir = opendir(path)) == NULL)
+				continue;
+			while((dp = readdir(dir)) != NULL) {
+				FILE *f;
+				int maj, min;
+#ifdef _DIRENT_HAVE_D_TYPE
+				if (dp->d_type != DT_UNKNOWN &&
+				    dp->d_type != DT_LNK)
+					continue;
+#endif
+				if (dp->d_name[0] == '.' &&
+				    ((dp->d_name[1] == 0) ||
+				    ((dp->d_name[1] == '.') && (dp->d_name[2] == 0))))
+					continue;
+				rc = snprintf(path + path_end,
+					       sizeof(path) - path_end,
+					       "/%s/dev", dp->d_name);
+				if (rc < 0 || (unsigned int)rc >= sizeof(path) - path_end)
+					continue;
+
+				if ((f = fopen(path, "r")) == NULL)
+					continue;
+				rc = fscanf(f, "%d:%d", &maj, &min);
+				fclose(f);
+				if (rc != 2)
+					continue;
+
+				disk = makedev(maj, min);
+				if (blkid_devno_to_wholedisk(disk, NULL, 0, &disk))
+					continue;
+				for (j = nlocks; j > 0; j--) {
+					/* XXX O(n^2) */
+					if (disk == locks[j-1].dev)
+						break;
+				}
+				if (j > 0) /* Don't add duplicates */
+					continue;
+				if (is_irrotational_disk(disk))
+					continue;
+				if (nlocks >= allocated) {
+					void *p;
+					if (allocated > INT_MAX/(2*sizeof(locks[0])))
+						continue;
+					p = realloc(locks, allocated*2*sizeof(locks[0]));
+					if (p == NULL)
+						continue;
+					locks = p;
+					allocated *= 2;
+				}
+				locks[nlocks++].dev = disk;
+			}
+			closedir(dir);
+		}
+		qsort(locks, nlocks, sizeof(locks[0]), compare_locks);
+		locks = realloc(locks, nlocks*sizeof(locks[0]));
+	} else {
+		inst->lock.dev = disk;
+		locks = &inst->lock;
+		nlocks = 1;
+	}
+	inst->locks = locks;
+	inst->nlocks = nlocks;
+
+	for (i = 0; i < nlocks; i++) {
+		disk = locks[i].dev;
+	diskpath = blkid_devno_to_devname(disk);
+	if (!diskpath) {
+		locks[i].lock = -1;
+		continue;
+	}
+	/* XXX is this necessary? E.g. why not replace with ..."/%d_%d.lock", major(), minor()); */
 	diskname = stripoff_last_component(diskpath);
 	if (!diskname)
 		diskname = diskpath;
 
-	xasprintf(&inst->lockpath, FSCK_RUNTIME_DIRNAME "/%s.lock", diskname);
+	rc = snprintf(path, sizeof(path), FSCK_RUNTIME_DIRNAME "/%s.lock", diskname);
+	if (rc < 0 || (unsigned int)rc >= sizeof(path)) {
+		locks[i].lock = -1;
+		free(diskpath);
+		continue;
+	}
 
 	if (verbose)
-		printf(_("Locking disk by %s ... "), inst->lockpath);
+		printf(_("Locking disk %d:%d by %s ... "),
+			major(disk), minor(disk), path);
 
-	inst->lock = open(inst->lockpath, O_RDONLY|O_CREAT|O_CLOEXEC,
+	locks[i].lock = open(path, O_RDONLY|O_CREAT|O_CLOEXEC,
 				    S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH);
-	if (inst->lock >= 0) {
+	if (locks[i].lock >= 0) {
 		int rc = -1;
 
 		/* inform users that we're waiting on the lock */
 		if (verbose &&
-		    (rc = flock(inst->lock, LOCK_EX | LOCK_NB)) != 0 &&
+		    (rc = flock(locks[i].lock, LOCK_EX | LOCK_NB)) != 0 &&
 		    errno == EWOULDBLOCK)
 			printf(_("(waiting) "));
 
-		if (rc != 0 && flock(inst->lock, LOCK_EX) != 0) {
-			close(inst->lock);			/* failed */
-			inst->lock = -1;
+		if (rc != 0 && flock(locks[i].lock, LOCK_EX) != 0) {
+			close(locks[i].lock);			/* failed */
+			locks[i].lock = -1;
 		}
 	}
 
 	if (verbose)
 		/* TRANSLATORS: These are followups to "Locking disk...". */
-		printf("%s.\n", inst->lock >= 0 ? _("succeeded") : _("failed"));
+		printf("%s.\n", locks[i].lock >= 0 ? _("succeeded") : _("failed"));
 
-
-done:
-	if (inst->lock < 0) {
-		free(inst->lockpath);
-		inst->lockpath = NULL;
-	}
 	free(diskpath);
+	}
 	return;
 }
 
 static void unlock_disk(struct fsck_instance *inst)
 {
-	if (inst->lock < 0)
-		return;
 
 	if (verbose)
-		printf(_("Unlocking %s.\n"), inst->lockpath);
-
-	close(inst->lock);			/* unlock */
-
-	free(inst->lockpath);
+		printf(_("Unlocking disks.\n"));
 
-	inst->lock = -1;
-	inst->lockpath = NULL;
+	while(inst->nlocks > 0)
+		close(inst->locks[--(inst->nlocks)].lock);	/* unlock */
+	if (inst->locks != &inst->lock)
+		free(inst->locks);
+	inst->locks = NULL;
 }
 
 static void free_instance(struct fsck_instance *i)
@@ -422,7 +520,8 @@ static void free_instance(struct fsck_instance *i)
 	if (lockdisk)
 		unlock_disk(i);
 	free(i->prog);
-	free(i->lockpath);
+	if (i->locks != &i->lock)
+		free(i->locks);
 	mnt_unref_fs(i->fs);
 	free(i);
 	return;
@@ -469,7 +568,7 @@ static void fs_interpret_type(struct libmnt_fs *fs)
 static int parser_errcb(struct libmnt_table *tb __attribute__ ((__unused__)),
 			const char *filename, int line)
 {
-	warnx(_("%s: parse error at line %d -- ignored"), filename, line);
+	warnx(_("%s: parse error at line %d -- ignore"), filename, line);
 	return 1;
 }
 
@@ -666,7 +765,8 @@ static int execute(const char *progname, const char *progpath,
 
 	mnt_ref_fs(fs);
 	inst->fs = fs;
-	inst->lock = -1;
+	inst->locks = NULL;
+	inst->nlocks = 0;
 
 	if (lockdisk)
 		lock_disk(inst);


* Re: [RFC][WIP][PATCH] fsck -l and lvm/dmcrypt/md/etc
  2016-04-08 21:40 [RFC][WIP][PATCH] fsck -l and lvm/dmcrypt/md/etc Yuriy M. Kaminskiy
@ 2016-04-18 10:59 ` Karel Zak
  2016-04-19 14:42   ` Yuriy M. Kaminskiy
From: Karel Zak @ 2016-04-18 10:59 UTC (permalink / raw)
  To: Yuriy M. Kaminskiy; +Cc: util-linux

On Sat, Apr 09, 2016 at 12:40:18AM +0300, Yuriy M. Kaminskiy wrote:
> I think this algorithm should work:
> 
> 1) walk /dev/block/${dev}/slaves recursively (e.g. lvm-over- luks-
> over-lvm- over-md, first-level slaves won't work), collect all
> whole-disk devices;
> 2) sort this list by device numbers (to avoid AB/BA deadlocks), remove
> duplicates;
> 3) acquire all locks consequently.
> 
> There are some unavoidable flaws (e.g. if someone alters structure while fsck
> is running), and some things could be improved, but it should work in
> most cases (and if it fails, it is just no worse than current behavior).
> 
> I've tested with LVM and LUKS-over-LVM on (debian's) 3.16 kernel, it seems
> works as expected.
> 
> What is not covered: /dev/loop* (and I have no plans for it).
> 
> Comments? NACKs?

The question is whether we really need it :-)

All the care fsck takes about whole disks is about performance: we
assume (on classic rotational disks) that executing fsck on multiple
partitions of the same disk at the same time is a bad idea.

The problem with stacked devices is that we have no clue about the
sector mapping. Your patch locks all the disks, but maybe the kernel
will read from only a subset of them, etc.

From my point of view the best way is not to care about it in
userspace at all and to depend on kernel I/O scheduling only. The
optimization probably still makes sense for the classic
"HDD->partition" layout, but I don't think we need to extend this
functionality to other use-cases.

    Karel

-- 
 Karel Zak  <kzak@redhat.com>
 http://karelzak.blogspot.com


* Re: [RFC][WIP][PATCH] fsck -l and lvm/dmcrypt/md/etc
  2016-04-18 10:59 ` Karel Zak
@ 2016-04-19 14:42   ` Yuriy M. Kaminskiy
  2016-04-19 15:11     ` [RFC][PATCH alt v2] fsck -l: block concurrent fsck for any stacked devices (Re: [RFC][WIP][PATCH] fsck -l and lvm/dmcrypt/md/etc) Yuriy M. Kaminskiy
From: Yuriy M. Kaminskiy @ 2016-04-19 14:42 UTC (permalink / raw)
  To: util-linux

[-- Attachment #1: Type: text/plain, Size: 2193 bytes --]

Karel Zak <kzak@redhat.com> writes:

> On Sat, Apr 09, 2016 at 12:40:18AM +0300, Yuriy M. Kaminskiy wrote:
>> I think this algorithm should work:
>> 
>> 1) walk /dev/block/${dev}/slaves recursively (e.g. lvm-over- luks-
>> over-lvm- over-md, first-level slaves won't work), collect all
>> whole-disk devices;
>> 2) sort this list by device numbers (to avoid AB/BA deadlocks), remove
>> duplicates;
>> 3) acquire all locks consequently.
>> 
>> There are some unavoidable flaws (e.g. if someone alters structure while fsck
>> is running), and some things could be improved, but it should work in
>> most cases (and if it fails, it is just no worse than current behavior).
>> 
>> I've tested with LVM and LUKS-over-LVM on (debian's) 3.16 kernel, it seems
>> works as expected.
>> 
>> What is not covered: /dev/loop* (and I have no plans for it).
>> 
>> Comments? NACKs?
>
> The question is if we really need it :-)

Yes?

> All the care fsck has about whole-disks is about performance, because
> we assume (on classic rotation disks) that execute fsck on more
> partitions in the same time is bad idea.
>
> The problem with stacked devices is that we have no clue about sectors
> mapping. Your patch locks all the disks, but maybe kernel will read
> only from some subset of the disks, etc.

Is this a likely scenario? (Or even possible? I can imagine that, with
lvm-over-raid1+, the kernel could theoretically schedule one fsck
instance to read only from slave A and another to read only from
slave B, thus avoiding seeking; but I suspect that in practice it will
just round-robin all requests over both slaves, producing the same
seek storm.)

> From my point of view the best way is not to care about it in
> userspace at all and depend on kernel I/O scheduling only. The

No problem. Please set an fsck-friendly in-kernel I/O scheduler for
fsck instead. (Oh, none exists?)

> optimization probably still makes sense for classic layout
> "HDD->partition", but I don't think we need to extend this
> functionality to another use-cases.

`fsck -A` does not execute fsck for any stacked devices in parallel;
`fsck -l` does. This is inconsistent and unfair. An alternative
(simpler) patch is attached (compile-tested only); the locking rule it
implements is sketched below.
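
Roughly, the rule the attached patch implements (a standalone sketch, not
the patch itself; FSCK_RUNTIME_DIRNAME is /run/fsck, the rest of the
surrounding code is illustrative only):

#include <fcntl.h>
#include <sys/file.h>
#include <unistd.h>

/*
 * Sketch of the stacked.lock rule: fsck on a stacked device takes the
 * lock exclusively, fsck on a plain device takes it shared.  A stacked
 * fsck therefore never runs concurrently with any other fsck, while
 * plain fscks still run in parallel with each other.  close() releases
 * the lock; the lock file itself must never be unlinked, because some
 * other process may still hold it shared.
 */
static int lock_stacked(int is_stacked)
{
	int fd = open("/run/fsck/stacked.lock",
		      O_RDONLY | O_CREAT | O_CLOEXEC, 0644);

	if (fd >= 0 && flock(fd, is_stacked ? LOCK_EX : LOCK_SH) != 0) {
		close(fd);
		fd = -1;	/* could not lock: fall back to no locking */
	}
	return fd;
}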


[-- Attachment #2: fsck-l-disallow-parallel-fsck-for-stacked-devices.patch --]
[-- Type: text/x-diff, Size: 1528 bytes --]

diff --git a/disk-utils/fsck.c b/disk-utils/fsck.c
index 84d2dcc..73ec9bd 100644
--- a/disk-utils/fsck.c
+++ b/disk-utils/fsck.c
@@ -109,6 +109,7 @@ struct fsck_instance {
 
 	int	lock;		/* flock()ed lockpath file descriptor or -1 */
 	char	*lockpath;	/* /run/fsck/<diskname>.lock or NULL */
+	int	lock_stacked;	/* flock()ed stacked file descriptor or -1 */
 
 	int	exit_status;
 	struct timeval start_time;
@@ -337,6 +338,7 @@ static int is_irrotational_disk(dev_t disk)
 
 static void lock_disk(struct fsck_instance *inst)
 {
+	char path[PATH_MAX];
 	dev_t disk = fs_get_disk(inst->fs, 1);
 	char *diskpath = NULL, *diskname;
 
@@ -365,6 +367,18 @@ static void lock_disk(struct fsck_instance *inst)
 	if (!diskname)
 		diskname = diskpath;
 
+	snprintf(path, sizeof(path), FSCK_RUNTIME_DIRNAME "/stacked.lock");
+
+	inst->lock_stacked = open(path, O_RDONLY|O_CREAT|O_CLOEXEC,
+				    S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH);
+	if (inst->lock_stacked >= 0) {
+		if (flock(inst->lock_stacked,
+			fs_is_stacked(inst->fs) ? LOCK_EX : LOCK_SH) != 0) {
+			close(inst->lock_stacked);	/* failed */
+			inst->lock_stacked = -1;
+		}
+	}
+
 	xasprintf(&inst->lockpath, FSCK_RUNTIME_DIRNAME "/%s.lock", diskname);
 
 	if (verbose)
@@ -410,10 +424,12 @@ static void unlock_disk(struct fsck_instance *inst)
 		printf(_("Unlocking %s.\n"), inst->lockpath);
 
 	close(inst->lock);			/* unlock */
+	close(inst->lock_stacked);		/* unlock */
 
 	free(inst->lockpath);
 
 	inst->lock = -1;
+	inst->lock_stacked = -1;
 	inst->lockpath = NULL;
 }
 


* [RFC][PATCH alt v2] fsck -l: block concurrent fsck for any stacked devices (Re: [RFC][WIP][PATCH] fsck -l and lvm/dmcrypt/md/etc)
  2016-04-19 14:42   ` Yuriy M. Kaminskiy
@ 2016-04-19 15:11     ` Yuriy M. Kaminskiy
From: Yuriy M. Kaminskiy @ 2016-04-19 15:11 UTC (permalink / raw)
  To: util-linux

[-- Attachment #1: Type: text/plain, Size: 2383 bytes --]

yumkam@gmail.com (Yuriy M. Kaminskiy) writes:

> Karel Zak <kzak@redhat.com> writes:
>
>> On Sat, Apr 09, 2016 at 12:40:18AM +0300, Yuriy M. Kaminskiy wrote:
>>> I think this algorithm should work:
>>> 
>>> 1) walk /dev/block/${dev}/slaves recursively (e.g. lvm-over- luks-
>>> over-lvm- over-md, first-level slaves won't work), collect all
>>> whole-disk devices;
>>> 2) sort this list by device numbers (to avoid AB/BA deadlocks), remove
>>> duplicates;
>>> 3) acquire all locks consequently.
>>> 
>>> There are some unavoidable flaws (e.g. if someone alters structure while fsck
>>> is running), and some things could be improved, but it should work in
>>> most cases (and if it fails, it is just no worse than current behavior).
>>> 
>>> I've tested with LVM and LUKS-over-LVM on (debian's) 3.16 kernel, it seems
>>> works as expected.
>>> 
>>> What is not covered: /dev/loop* (and I have no plans for it).
>>> 
>>> Comments? NACKs?
>>
>> The question is if we really need it :-)
>
> Yes?
>
>> All the care fsck has about whole-disks is about performance, because
>> we assume (on classic rotation disks) that execute fsck on more
>> partitions in the same time is bad idea.
>>
>> The problem with stacked devices is that we have no clue about sectors
>> mapping. Your patch locks all the disks, but maybe kernel will read
>> only from some subset of the disks, etc.
>
> Is this likely scenario? (Or even possible? I can imagine that, with
> lvm-over-raid1+, kernel theoretically can schedule one fsck instance to
> read only slaveA, and another to read only from slaveB, thus avoiding
> seeking; but I suspect, in practice it will just roundrobin all requests
> over both slaves, thus same seekstorm.)
>
>> From my point of view the best way is not to care about it in
>> userspace at all and depend on kernel I/O scheduling only. The
>
> No problem. Please set in-kernel fsck-friendly io scheduler for fsck
> instead. (Oh, there are none exists?)
>
>> optimization probably still makes sense for classic layout
>> "HDD->partition", but I don't think we need to extend this
>> functionality to another use-cases.
>
> `fsck -A` does not execute fsck for any stacked devices in
> parallel. `fsck -l` does. This is inconsistent and unfair. Alternative
> (simplier) patch attached (compile-tested only).

v2 (path variable and snprintf removed, added comment about unlock/unlink).


[-- Attachment #2: fsck-l-disallow-parallel-fsck-for-stacked-devices.v2.patch --]
[-- Type: text/x-diff, Size: 1372 bytes --]

diff --git a/disk-utils/fsck.c b/disk-utils/fsck.c
index 84d2dcc..bd9114a 100644
--- a/disk-utils/fsck.c
+++ b/disk-utils/fsck.c
@@ -109,6 +109,7 @@ struct fsck_instance {
 
 	int	lock;		/* flock()ed lockpath file descriptor or -1 */
 	char	*lockpath;	/* /run/fsck/<diskname>.lock or NULL */
+	int	lock_stacked;	/* flock()ed stacked file descriptor or -1 */
 
 	int	exit_status;
 	struct timeval start_time;
@@ -365,6 +366,21 @@ static void lock_disk(struct fsck_instance *inst)
 	if (!diskname)
 		diskname = diskpath;
 
+	/*
+	 * Note: stacked.lock can be share-locked and MUST NOT be removed
+	 * upon unlock!
+	 */
+	inst->lock_stacked = open(FSCK_RUNTIME_DIRNAME "/stacked.lock",
+				    O_RDONLY|O_CREAT|O_CLOEXEC,
+				    S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH);
+	if (inst->lock_stacked >= 0) {
+		if (flock(inst->lock_stacked,
+			fs_is_stacked(inst->fs) ? LOCK_EX : LOCK_SH) != 0) {
+			close(inst->lock_stacked);	/* failed */
+			inst->lock_stacked = -1;
+		}
+	}
+
 	xasprintf(&inst->lockpath, FSCK_RUNTIME_DIRNAME "/%s.lock", diskname);
 
 	if (verbose)
@@ -410,10 +426,12 @@ static void unlock_disk(struct fsck_instance *inst)
 		printf(_("Unlocking %s.\n"), inst->lockpath);
 
 	close(inst->lock);			/* unlock */
+	close(inst->lock_stacked);		/* unlock */
 
 	free(inst->lockpath);
 
 	inst->lock = -1;
+	inst->lock_stacked = -1;
 	inst->lockpath = NULL;
 }
 

