All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 0/6] Generic per-sb io stats
@ 2022-03-01 18:42 Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 1/6] lib/percpu_counter: add helpers for arrays of counters Amir Goldstein
                   ` (6 more replies)
  0 siblings, 7 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-01 18:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Dave Chinner, Al Viro, linux-unionfs, linux-fsdevel

Miklos,

Following your feedback on v2 [1], I moved the iostats to per-sb.

Thanks,
Amir.

[1] https://lore.kernel.org/linux-unionfs/20220228113910.1727819-1-amir73il@gmail.com/

Changes since v2:
- Change from per-mount to per-sb io stats (szeredi)
- Avoid percpu loop when reading mountstats (dchinner)

Changes since v1:
- Opt-in for per-mount io stats for overlayfs and fuse

Amir Goldstein (6):
  lib/percpu_counter: add helpers for arrays of counters
  fs: add optional iostats counters to struct super_block
  fs: collect per-sb io stats
  fs: report per-sb io stats
  ovl: opt-in for per-sb io stats
  fuse: opt-in for per-sb io stats

 fs/Kconfig                     |   8 +++
 fs/fuse/inode.c                |   5 ++
 fs/nfsd/export.c               |   7 +-
 fs/nfsd/nfscache.c             |   5 +-
 fs/nfsd/stats.c                |  37 +---------
 fs/nfsd/stats.h                |   3 -
 fs/overlayfs/super.c           |   5 ++
 fs/proc_namespace.c            |  16 +++++
 fs/read_write.c                |  88 ++++++++++++++++-------
 fs/super.c                     |   2 +
 include/linux/fs.h             |  10 ++-
 include/linux/fs_iostats.h     | 127 +++++++++++++++++++++++++++++++++
 include/linux/percpu_counter.h |  28 ++++++++
 lib/percpu_counter.c           |  27 +++++++
 14 files changed, 300 insertions(+), 68 deletions(-)
 create mode 100644 include/linux/fs_iostats.h

-- 
2.25.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v3 1/6] lib/percpu_counter: add helpers for arrays of counters
  2022-03-01 18:42 [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
@ 2022-03-01 18:42 ` Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 2/6] fs: add optional iostats counters to struct super_block Amir Goldstein
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-01 18:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Dave Chinner, Al Viro, linux-unionfs, linux-fsdevel

Hoist the helpers to init/reset/destroy an array of counters from
nfsd_stats to percpu_counter library.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/nfsd/export.c               |  7 ++++---
 fs/nfsd/nfscache.c             |  5 +++--
 fs/nfsd/stats.c                | 37 +++-------------------------------
 fs/nfsd/stats.h                |  3 ---
 include/linux/percpu_counter.h | 28 +++++++++++++++++++++++++
 lib/percpu_counter.c           | 27 +++++++++++++++++++++++++
 6 files changed, 65 insertions(+), 42 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 668c7527b17e..20770f049ac3 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -334,17 +334,18 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
 static int export_stats_init(struct export_stats *stats)
 {
 	stats->start_time = ktime_get_seconds();
-	return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM);
+	return percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM, 0,
+				    GFP_KERNEL);
 }
 
 static void export_stats_reset(struct export_stats *stats)
 {
-	nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
+	percpu_counters_set(stats->counter, EXP_STATS_COUNTERS_NUM, 0);
 }
 
 static void export_stats_destroy(struct export_stats *stats)
 {
-	nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
+	percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
 }
 
 static void svc_export_put(struct kref *ref)
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index a4a69ab6ab28..78e3820ea423 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -156,12 +156,13 @@ void nfsd_drc_slab_free(void)
 
 static int nfsd_reply_cache_stats_init(struct nfsd_net *nn)
 {
-	return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
+	return percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM, 0,
+				    GFP_KERNEL);
 }
 
 static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn)
 {
-	nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
+	percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
 }
 
 int nfsd_reply_cache_init(struct nfsd_net *nn)
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index a8c5a02a84f0..933e703cbb3b 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -84,46 +84,15 @@ static const struct proc_ops nfsd_proc_ops = {
 	.proc_release	= single_release,
 };
 
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
-{
-	int i, err = 0;
-
-	for (i = 0; !err && i < num; i++)
-		err = percpu_counter_init(&counters[i], 0, GFP_KERNEL);
-
-	if (!err)
-		return 0;
-
-	for (; i > 0; i--)
-		percpu_counter_destroy(&counters[i-1]);
-
-	return err;
-}
-
-void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num)
-{
-	int i;
-
-	for (i = 0; i < num; i++)
-		percpu_counter_set(&counters[i], 0);
-}
-
-void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
-{
-	int i;
-
-	for (i = 0; i < num; i++)
-		percpu_counter_destroy(&counters[i]);
-}
-
 static int nfsd_stat_counters_init(void)
 {
-	return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+	return percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM,
+				    0, GFP_KERNEL);
 }
 
 static void nfsd_stat_counters_destroy(void)
 {
-	nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+	percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
 }
 
 int nfsd_stat_init(void)
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 9b43dc3d9991..61840f9035a9 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -36,9 +36,6 @@ extern struct nfsd_stats	nfsdstats;
 
 extern struct svc_stat		nfsd_svcstats;
 
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num);
 int nfsd_stat_init(void);
 void nfsd_stat_shutdown(void);
 
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 01861eebed79..2051aab02d5d 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -46,6 +46,10 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
 void percpu_counter_sync(struct percpu_counter *fbc);
 
+int percpu_counters_init(struct percpu_counter counters[], int num, s64 amount,
+			 gfp_t gfp);
+void percpu_counters_destroy(struct percpu_counter counters[], int num);
+
 static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
 {
 	return __percpu_counter_compare(fbc, rhs, percpu_counter_batch);
@@ -105,10 +109,25 @@ static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
 	return 0;
 }
 
+static inline int percpu_counters_init(struct percpu_counter counters[],
+				       int num, s64 amount, gfp_t gfp)
+{
+	int i;
+
+	for (i = 0; i < num; i++)
+		percpu_counter_init(&counters[i], amount, gfp);
+	return 0;
+}
+
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 {
 }
 
+static inline void percpu_counters_destroy(struct percpu_counter counters[],
+					   int num)
+{
+}
+
 static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 {
 	fbc->count = amount;
@@ -193,4 +212,13 @@ static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
 	percpu_counter_add(fbc, -amount);
 }
 
+static inline void percpu_counters_set(struct percpu_counter counters[],
+				       int num, s64 amount)
+{
+	int i;
+
+	for (i = 0; i < num; i++)
+		percpu_counter_set(&counters[i], amount);
+}
+
 #endif /* _LINUX_PERCPU_COUNTER_H */
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index ed610b75dc32..f75a45c63c18 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -181,6 +181,33 @@ void percpu_counter_destroy(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(percpu_counter_destroy);
 
+int percpu_counters_init(struct percpu_counter counters[], int num, s64 amount,
+			 gfp_t gfp)
+{
+	int i, err = 0;
+
+	for (i = 0; !err && i < num; i++)
+		err = percpu_counter_init(&counters[i], amount, gfp);
+
+	if (!err)
+		return 0;
+
+	for (; i > 0; i--)
+		percpu_counter_destroy(&counters[i-1]);
+
+	return err;
+}
+EXPORT_SYMBOL(percpu_counters_init);
+
+void percpu_counters_destroy(struct percpu_counter counters[], int num)
+{
+	int i;
+
+	for (i = 0; i < num; i++)
+		percpu_counter_destroy(&counters[i]);
+}
+EXPORT_SYMBOL(percpu_counters_destroy);
+
 int percpu_counter_batch __read_mostly = 32;
 EXPORT_SYMBOL(percpu_counter_batch);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v3 2/6] fs: add optional iostats counters to struct super_block
  2022-03-01 18:42 [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 1/6] lib/percpu_counter: add helpers for arrays of counters Amir Goldstein
@ 2022-03-01 18:42 ` Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 3/6] fs: collect per-sb io stats Amir Goldstein
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-01 18:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Dave Chinner, Al Viro, linux-unionfs, linux-fsdevel

With CONFIG_FS_IOSTATS, filesystem can attach an array of counters to
struct super_block by calling sb_iostats_init().

These counters will be used to collect per-sb I/O statistics and display
them in /proc/<pid>/mountstats.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/Kconfig                 |   8 +++
 fs/super.c                 |   2 +
 include/linux/fs.h         |  10 ++-
 include/linux/fs_iostats.h | 127 +++++++++++++++++++++++++++++++++++++
 4 files changed, 145 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/fs_iostats.h

diff --git a/fs/Kconfig b/fs/Kconfig
index 6c7dc1387beb..394d9da6bda9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -15,6 +15,14 @@ config VALIDATE_FS_PARSER
 	  Enable this to perform validation of the parameter description for a
 	  filesystem when it is registered.
 
+config FS_IOSTATS
+	bool "Enable generic filesystem I/O statistics"
+	help
+	  Enable this to allow collecting filesystem I/O statistics and display
+	  them in /proc/<pid>/mountstats.
+
+	  Say N if unsure.
+
 config FS_IOMAP
 	bool
 
diff --git a/fs/super.c b/fs/super.c
index f1d4a193602d..c447cadb402b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,6 +36,7 @@
 #include <linux/lockdep.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_context.h>
+#include <linux/fs_iostats.h>
 #include <uapi/linux/mount.h>
 #include "internal.h"
 
@@ -290,6 +291,7 @@ static void __put_super(struct super_block *s)
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(!list_empty(&s->s_mounts));
+		sb_iostats_destroy(s);
 		security_sb_free(s);
 		fscrypt_sb_free(s);
 		put_user_ns(s->s_user_ns);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e2d892b201b0..a71c94cbb6c1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1454,6 +1454,8 @@ struct sb_writers {
 	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
+struct sb_iostats;
+
 struct super_block {
 	struct list_head	s_list;		/* Keep this first */
 	dev_t			s_dev;		/* search index; _not_ kdev_t */
@@ -1508,8 +1510,12 @@ struct super_block {
 	/* Granularity of c/m/atime in ns (cannot be worse than a second) */
 	u32			s_time_gran;
 	/* Time limits for c/m/atime in seconds */
-	time64_t		   s_time_min;
-	time64_t		   s_time_max;
+	time64_t		s_time_min;
+	time64_t		s_time_max;
+#ifdef CONFIG_FS_IOSTATS
+	/* Optional per-sb I/O stats */
+	struct sb_iostats	*s_iostats;
+#endif
 #ifdef CONFIG_FSNOTIFY
 	__u32			s_fsnotify_mask;
 	struct fsnotify_mark_connector __rcu	*s_fsnotify_marks;
diff --git a/include/linux/fs_iostats.h b/include/linux/fs_iostats.h
new file mode 100644
index 000000000000..60d1efbea7d9
--- /dev/null
+++ b/include/linux/fs_iostats.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_IOSTATS_H
+#define _LINUX_FS_IOSTATS_H
+
+#include <linux/fs.h>
+#include <linux/percpu_counter.h>
+#include <linux/slab.h>
+
+/* Similar to task_io_accounting members */
+enum {
+	SB_IOSTATS_CHARS_RD,	/* bytes read via syscalls */
+	SB_IOSTATS_CHARS_WR,	/* bytes written via syscalls */
+	SB_IOSTATS_SYSCALLS_RD,	/* # of read syscalls */
+	SB_IOSTATS_SYSCALLS_WR,	/* # of write syscalls */
+	SB_IOSTATS_COUNTERS_NUM
+};
+
+struct sb_iostats {
+	time64_t		start_time;
+	struct percpu_counter	counter[SB_IOSTATS_COUNTERS_NUM];
+};
+
+#ifdef CONFIG_FS_IOSTATS
+static inline struct sb_iostats *sb_iostats(struct super_block *sb)
+{
+	return sb->s_iostats;
+}
+
+static inline bool sb_has_iostats(struct super_block *sb)
+{
+	return !!sb->s_iostats;
+}
+
+static inline void sb_iostats_destroy(struct super_block *sb)
+{
+	if (!sb->s_iostats)
+		return;
+
+	percpu_counters_destroy(sb->s_iostats->counter,
+				SB_IOSTATS_COUNTERS_NUM);
+	kfree(sb->s_iostats);
+	sb->s_iostats = NULL;
+}
+
+/* Initialize per-sb I/O stats */
+static inline int sb_iostats_init(struct super_block *sb)
+{
+	int err;
+
+	sb->s_iostats = kmalloc(sizeof(struct sb_iostats), GFP_KERNEL);
+	if (!sb->s_iostats)
+		return -ENOMEM;
+
+	err = percpu_counters_init(sb->s_iostats->counter,
+				   SB_IOSTATS_COUNTERS_NUM, 0, GFP_KERNEL);
+	if (err) {
+		kfree(sb->s_iostats);
+		sb->s_iostats = NULL;
+		return err;
+	}
+
+	sb->s_iostats->start_time = ktime_get_seconds();
+	return 0;
+}
+
+static inline void sb_iostats_counter_inc(struct super_block *sb, int id)
+{
+	if (!sb->s_iostats)
+		return;
+
+	percpu_counter_inc(&sb->s_iostats->counter[id]);
+}
+
+static inline void sb_iostats_counter_add(struct super_block *sb, int id,
+					  s64 amt)
+{
+	if (!sb->s_iostats)
+		return;
+
+	percpu_counter_add(&sb->s_iostats->counter[id], amt);
+}
+
+static inline s64 sb_iostats_counter_read(struct super_block *sb, int id)
+{
+	if (!sb->s_iostats)
+		return 0;
+
+	return percpu_counter_read_positive(&sb->s_iostats->counter[id]);
+}
+
+#else /* !CONFIG_FS_IOSTATS */
+
+static inline struct sb_iostats *sb_iostats(struct super_block *sb)
+{
+	return NULL;
+}
+
+static inline bool sb_has_iostats(struct super_block *sb)
+{
+	return false;
+}
+
+static inline int sb_iostats_init(struct super_block *sb)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void sb_iostats_destroy(struct super_block *sb)
+{
+}
+
+static inline void sb_iostats_counter_inc(struct super_block *sb, int id)
+{
+}
+
+static inline void sb_iostats_counter_add(struct super_block *sb, int id,
+					  s64 amt)
+{
+}
+
+static inline s64 sb_iostats_counter_read(struct super_block *sb, int id)
+{
+	return 0;
+}
+#endif
+
+#endif /* _LINUX_FS_IOSTATS_H */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v3 3/6] fs: collect per-sb io stats
  2022-03-01 18:42 [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 1/6] lib/percpu_counter: add helpers for arrays of counters Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 2/6] fs: add optional iostats counters to struct super_block Amir Goldstein
@ 2022-03-01 18:42 ` Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 4/6] fs: report " Amir Goldstein
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-01 18:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Dave Chinner, Al Viro, linux-unionfs, linux-fsdevel

Replace task io account helpers with wrappers that may also collect
per-sb io stats.

Filesystems that want these per-sb io stats collected need to opt-in
with sb_iostats_init().

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/read_write.c | 88 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 64 insertions(+), 24 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 0074afa7ecb3..8c599bf2dd78 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -20,6 +20,7 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
+#include <linux/fs_iostats.h>
 #include "internal.h"
 
 #include <linux/uaccess.h>
@@ -34,6 +35,45 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+static inline void file_iostats_counter_inc(struct file *file, int id)
+{
+	if (file)
+		sb_iostats_counter_inc(file->f_path.mnt->mnt_sb, id);
+}
+
+static inline void file_iostats_counter_add(struct file *file, int id,
+					    ssize_t amt)
+{
+	if (file)
+		sb_iostats_counter_add(file->f_path.mnt->mnt_sb, id, amt);
+}
+
+static void file_add_rchar(struct file *file, struct task_struct *tsk,
+			   ssize_t amt)
+{
+	file_iostats_counter_add(file, SB_IOSTATS_CHARS_RD, amt);
+	add_rchar(tsk, amt);
+}
+
+static void file_add_wchar(struct file *file, struct task_struct *tsk,
+			   ssize_t amt)
+{
+	file_iostats_counter_add(file, SB_IOSTATS_CHARS_WR, amt);
+	add_wchar(tsk, amt);
+}
+
+static void file_inc_syscr(struct file *file, struct task_struct *tsk)
+{
+	file_iostats_counter_inc(file, SB_IOSTATS_SYSCALLS_RD);
+	inc_syscr(tsk);
+}
+
+static void file_inc_syscw(struct file *file, struct task_struct *tsk)
+{
+	file_iostats_counter_inc(file, SB_IOSTATS_SYSCALLS_WR);
+	inc_syscw(tsk);
+}
+
 static inline bool unsigned_offsets(struct file *file)
 {
 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
@@ -441,9 +481,9 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 		if (pos)
 			*pos = kiocb.ki_pos;
 		fsnotify_access(file);
-		add_rchar(current, ret);
+		file_add_rchar(file, current, ret);
 	}
-	inc_syscr(current);
+	file_inc_syscr(file, current);
 	return ret;
 }
 
@@ -483,9 +523,9 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 		ret = -EINVAL;
 	if (ret > 0) {
 		fsnotify_access(file);
-		add_rchar(current, ret);
+		file_add_rchar(file, current, ret);
 	}
-	inc_syscr(current);
+	file_inc_syscr(file, current);
 	return ret;
 }
 
@@ -537,9 +577,9 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
 		if (pos)
 			*pos = kiocb.ki_pos;
 		fsnotify_modify(file);
-		add_wchar(current, ret);
+		file_add_wchar(file, current, ret);
 	}
-	inc_syscw(current);
+	file_inc_syscw(file, current);
 	return ret;
 }
 /*
@@ -592,9 +632,9 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 		ret = -EINVAL;
 	if (ret > 0) {
 		fsnotify_modify(file);
-		add_wchar(current, ret);
+		file_add_wchar(file, current, ret);
 	}
-	inc_syscw(current);
+	file_inc_syscw(file, current);
 	file_end_write(file);
 	return ret;
 }
@@ -947,8 +987,8 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 	}
 
 	if (ret > 0)
-		add_rchar(current, ret);
-	inc_syscr(current);
+		file_add_rchar(f.file, current, ret);
+	file_inc_syscr(f.file, current);
 	return ret;
 }
 
@@ -971,8 +1011,8 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
 	}
 
 	if (ret > 0)
-		add_wchar(current, ret);
-	inc_syscw(current);
+		file_add_wchar(f.file, current, ret);
+	file_inc_syscw(f.file, current);
 	return ret;
 }
 
@@ -1000,8 +1040,8 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
 	}
 
 	if (ret > 0)
-		add_rchar(current, ret);
-	inc_syscr(current);
+		file_add_rchar(f.file, current, ret);
+	file_inc_syscr(f.file, current);
 	return ret;
 }
 
@@ -1023,8 +1063,8 @@ static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
 	}
 
 	if (ret > 0)
-		add_wchar(current, ret);
-	inc_syscw(current);
+		file_add_wchar(f.file, current, ret);
+	file_inc_syscw(f.file, current);
 	return ret;
 }
 
@@ -1250,8 +1290,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	}
 
 	if (retval > 0) {
-		add_rchar(current, retval);
-		add_wchar(current, retval);
+		file_add_rchar(in.file, current, retval);
+		file_add_wchar(out.file, current, retval);
 		fsnotify_access(in.file);
 		fsnotify_modify(out.file);
 		out.file->f_pos = out_pos;
@@ -1261,8 +1301,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 			in.file->f_pos = pos;
 	}
 
-	inc_syscr(current);
-	inc_syscw(current);
+	file_inc_syscr(in.file, current);
+	file_inc_syscw(out.file, current);
 	if (pos > max)
 		retval = -EOVERFLOW;
 
@@ -1511,13 +1551,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
 done:
 	if (ret > 0) {
 		fsnotify_access(file_in);
-		add_rchar(current, ret);
+		file_add_rchar(file_in, current, ret);
 		fsnotify_modify(file_out);
-		add_wchar(current, ret);
+		file_add_wchar(file_out, current, ret);
 	}
 
-	inc_syscr(current);
-	inc_syscw(current);
+	file_inc_syscr(file_in, current);
+	file_inc_syscw(file_out, current);
 
 	file_end_write(file_out);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v3 4/6] fs: report per-sb io stats
  2022-03-01 18:42 [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
                   ` (2 preceding siblings ...)
  2022-03-01 18:42 ` [PATCH v3 3/6] fs: collect per-sb io stats Amir Goldstein
@ 2022-03-01 18:42 ` Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 5/6] ovl: opt-in for " Amir Goldstein
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-01 18:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Dave Chinner, Al Viro, linux-unionfs, linux-fsdevel

Show optional collected per-sb io stats in /proc/<pid>/mountstats
for filesystems that do not implement their own show_stats() method
and opted-in to generic per-sb stats with sb_iostats_init().

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/proc_namespace.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 49650e54d2f8..9054a909e031 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -11,6 +11,7 @@
 #include <linux/nsproxy.h>
 #include <linux/security.h>
 #include <linux/fs_struct.h>
+#include <linux/fs_iostats.h>
 #include <linux/sched/task.h>
 
 #include "proc/internal.h" /* only for get_proc_task() in ->open() */
@@ -232,6 +233,21 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
 	if (sb->s_op->show_stats) {
 		seq_putc(m, ' ');
 		err = sb->s_op->show_stats(m, mnt_path.dentry);
+	} else if (sb_has_iostats(sb)) {
+		struct sb_iostats *iostats = sb_iostats(sb);
+
+		/* Similar to /proc/<pid>/io */
+		seq_printf(m, "\n"
+			   "\ttimes: %lld %lld\n"
+			   "\trchar: %lld\n"
+			   "\twchar: %lld\n"
+			   "\tsyscr: %lld\n"
+			   "\tsyscw: %lld\n",
+			   iostats->start_time, ktime_get_seconds(),
+			   sb_iostats_counter_read(sb, SB_IOSTATS_CHARS_RD),
+			   sb_iostats_counter_read(sb, SB_IOSTATS_CHARS_WR),
+			   sb_iostats_counter_read(sb, SB_IOSTATS_SYSCALLS_RD),
+			   sb_iostats_counter_read(sb, SB_IOSTATS_SYSCALLS_WR));
 	}
 
 	seq_putc(m, '\n');
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v3 5/6] ovl: opt-in for per-sb io stats
  2022-03-01 18:42 [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
                   ` (3 preceding siblings ...)
  2022-03-01 18:42 ` [PATCH v3 4/6] fs: report " Amir Goldstein
@ 2022-03-01 18:42 ` Amir Goldstein
  2022-03-01 18:42 ` [PATCH v3 6/6] fuse: " Amir Goldstein
  2022-03-02  6:59 ` [PATCH v3 0/6] Generic " Dave Chinner
  6 siblings, 0 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-01 18:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Dave Chinner, Al Viro, linux-unionfs, linux-fsdevel

Traditionally, system administrators have used the iostat utility
to track the amount of io performed to a local disk filesystem.

Similar functionality is provided for NFS mounts via the nfsstat
utility that reads the NFS client's stats from /proc/pid/mountstats.

There is currently no good way for a system administrator or a
monitoring application inside a container to track the amount of io
performed via overlayfs.

Opt-in for generic io stats via /proc/pid/mountstats to provide
that functionality.

This feature depends on CONFIG_FS_IOSTATS.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/overlayfs/super.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7bb0a47cb615..94ab6adebe07 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -6,6 +6,7 @@
 
 #include <uapi/linux/magic.h>
 #include <linux/fs.h>
+#include <linux/fs_iostats.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
 #include <linux/mount.h>
@@ -1979,6 +1980,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_d_op = &ovl_dentry_operations;
 
+	err = sb_iostats_init(sb);
+	if (err && err != -EOPNOTSUPP)
+		goto out;
+
 	err = -ENOMEM;
 	ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
 	if (!ofs)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v3 6/6] fuse: opt-in for per-sb io stats
  2022-03-01 18:42 [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
                   ` (4 preceding siblings ...)
  2022-03-01 18:42 ` [PATCH v3 5/6] ovl: opt-in for " Amir Goldstein
@ 2022-03-01 18:42 ` Amir Goldstein
  2022-03-02  6:59 ` [PATCH v3 0/6] Generic " Dave Chinner
  6 siblings, 0 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-01 18:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Dave Chinner, Al Viro, linux-unionfs, linux-fsdevel

Traditionally, system administrators have used the iostat utility
to track the amount of io performed to a local disk filesystem.

Similar functionality is provided for NFS mounts via the nfsstat
utility that reads the NFS client's stats from /proc/pid/mountstats.

There is currently no good way for a system administrator or a
monitoring application to track the amount of io performed via fuse
filesystems.

Opt-in for generic io stats via /proc/pid/mountstats to provide
that functionality.

It is possible to collect io stats on the server side inside libfuse,
but those io stats will not cover cached writes and reads.  Therefore,
implementing the server side io stats would be complementary to these
client side io stats.  Also, this feature provides the io stats for
existing fuse filesystem/lib release binaries.

This feature depends on CONFIG_FS_IOSTATS.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/fuse/inode.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9ee36aa73251..f19c666b9ac3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/fs_context.h>
+#include <linux/fs_iostats.h>
 #include <linux/fs_parser.h>
 #include <linux/statfs.h>
 #include <linux/random.h>
@@ -1517,6 +1518,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 	if (sb->s_flags & SB_MANDLOCK)
 		goto err;
 
+	err = sb_iostats_init(sb);
+	if (err && err != -EOPNOTSUPP)
+		goto err;
+
 	rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
 	fuse_sb_defaults(sb);
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 0/6] Generic per-sb io stats
  2022-03-01 18:42 [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
                   ` (5 preceding siblings ...)
  2022-03-01 18:42 ` [PATCH v3 6/6] fuse: " Amir Goldstein
@ 2022-03-02  6:59 ` Dave Chinner
  2022-03-02  7:43   ` Amir Goldstein
  6 siblings, 1 reply; 22+ messages in thread
From: Dave Chinner @ 2022-03-02  6:59 UTC (permalink / raw)
  To: Amir Goldstein; +Cc: Miklos Szeredi, Al Viro, linux-unionfs, linux-fsdevel

On Tue, Mar 01, 2022 at 08:42:15PM +0200, Amir Goldstein wrote:
> Miklos,
> 
> Following your feedback on v2 [1], I moved the iostats to per-sb.
> 
> Thanks,
> Amir.
> 
> [1] https://lore.kernel.org/linux-unionfs/20220228113910.1727819-1-amir73il@gmail.com/
> 
> Changes since v2:
> - Change from per-mount to per-sb io stats (szeredi)
> - Avoid percpu loop when reading mountstats (dchinner)
> 
> Changes since v1:
> - Opt-in for per-mount io stats for overlayfs and fuse

Why make it optional only for specific filesystem types? Shouldn't
every superblock capture these stats and export them in exactly the
same place?

Making it properly generic greatly simplifies the implementation,
too...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 0/6] Generic per-sb io stats
  2022-03-02  6:59 ` [PATCH v3 0/6] Generic " Dave Chinner
@ 2022-03-02  7:43   ` Amir Goldstein
  2022-03-02  8:26     ` Dave Chinner
  0 siblings, 1 reply; 22+ messages in thread
From: Amir Goldstein @ 2022-03-02  7:43 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Miklos Szeredi, Al Viro, overlayfs, linux-fsdevel

On Wed, Mar 2, 2022 at 8:59 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, Mar 01, 2022 at 08:42:15PM +0200, Amir Goldstein wrote:
> > Miklos,
> >
> > Following your feedback on v2 [1], I moved the iostats to per-sb.
> >
> > Thanks,
> > Amir.
> >
> > [1] https://lore.kernel.org/linux-unionfs/20220228113910.1727819-1-amir73il@gmail.com/
> >
> > Changes since v2:
> > - Change from per-mount to per-sb io stats (szeredi)
> > - Avoid percpu loop when reading mountstats (dchinner)
> >
> > Changes since v1:
> > - Opt-in for per-mount io stats for overlayfs and fuse
>
> Why make it optional only for specific filesystem types? Shouldn't
> every superblock capture these stats and export them in exactly the
> same place?
>

I am not sure what you are asking.

Any filesystem can opt-in to get those generic io stats.
This is exactly the same as any filesystem can already opt-in for
fs specific io stats using the s_op->show_stats() vfs op.

All I did was to provide a generic implementation.
The generic io stats are collected and displayed for all filesystems the
same way.

I only included patches for overlayfs and fuse to opt-in for generic io stats,
because I think those stats should be reported unconditionally (to
mount options)
for fuse/overlayfs and I hope that Miklos agrees with me.

If there is wide consensus that all filesystems should have those stats
unconditionally (to mount options), then I can post another patch to make
the behavior not opt-in, but I have a feeling that this discussion
will take longer
than I care to wait before enabling the io stats for fuse/overlayfs,
so I would rather
start with enabling io stats for fuse/overlayfs and decide about the rest later.

How would you prefer the io stats behavior for xfs (or any fs) to be?
Unconditional to mount options?
Opt-in with mount option? (suggest name please)
Opt-in/out with mount options and default with Kconfig/sysfs tunable?
Anything else?

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 0/6] Generic per-sb io stats
  2022-03-02  7:43   ` Amir Goldstein
@ 2022-03-02  8:26     ` Dave Chinner
  2022-03-02  8:34       ` Amir Goldstein
  2022-03-02 16:59       ` Amir Goldstein
  0 siblings, 2 replies; 22+ messages in thread
From: Dave Chinner @ 2022-03-02  8:26 UTC (permalink / raw)
  To: Amir Goldstein; +Cc: Miklos Szeredi, Al Viro, overlayfs, linux-fsdevel

On Wed, Mar 02, 2022 at 09:43:50AM +0200, Amir Goldstein wrote:
> On Wed, Mar 2, 2022 at 8:59 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Tue, Mar 01, 2022 at 08:42:15PM +0200, Amir Goldstein wrote:
> > > Miklos,
> > >
> > > Following your feedback on v2 [1], I moved the iostats to per-sb.
> > >
> > > Thanks,
> > > Amir.
> > >
> > > [1] https://lore.kernel.org/linux-unionfs/20220228113910.1727819-1-amir73il@gmail.com/
> > >
> > > Changes since v2:
> > > - Change from per-mount to per-sb io stats (szeredi)
> > > - Avoid percpu loop when reading mountstats (dchinner)
> > >
> > > Changes since v1:
> > > - Opt-in for per-mount io stats for overlayfs and fuse
> >
> > Why make it optional only for specific filesystem types? Shouldn't
> > every superblock capture these stats and export them in exactly the
> > same place?
> >
> 
> I am not sure what you are asking.
> 
> Any filesystem can opt-in to get those generic io stats.

Yes, but why even make it opt-in? Why not just set these up
unconditionally in alloc_super() for all filesystems? Either this is
useful information for userspace monitoring and diagnostics, or it's
not useful at all. If it is useful, then all superblocks should
export this stuff rather than just some special subset of
filesystems where individual maintainers have noticed it and thought
"that might be useful".

Just enable it for every superblock....

> This is exactly the same as any filesystem can already opt-in for
> fs specific io stats using the s_op->show_stats() vfs op.
> 
> All I did was to provide a generic implementation.
> The generic io stats are collected and displayed for all filesystems the
> same way.
> 
> I only included patches for overlayfs and fuse to opt-in for generic io stats,
> because I think those stats should be reported unconditionally (to
> mount options)
> for fuse/overlayfs and I hope that Miklos agrees with me.

Yup, and I'm asking you why it should be optional - no filesystem
ever sees this information - it's totally generic VFS level code
except for the structure allocation. What's the point of *not*
enabling it for every superblock unconditionally?

> If there is wide consensus that all filesystems should have those stats
> unconditionally (to mount options), then I can post another patch to make
> the behavior not opt-in, but I have a feeling that this discussion

That's exactly what I want you to do. We're already having this
discussion, so let's get it over and done with right now.

> How would you prefer the io stats behavior for xfs (or any fs) to be?
> Unconditional to mount options?
> Opt-in with mount option? (suggest name please)
> Opt-in/out with mount options and default with Kconfig/sysfs tunable?
> Anything else?

Unconditional, for all filesystems, so they all display the same
stats in exactly same place without any filesystem having to
implement a single line of code anywhere. A single kconfig option
like you already have is just fine to turn it off for those that
don't want to use it.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 0/6] Generic per-sb io stats
  2022-03-02  8:26     ` Dave Chinner
@ 2022-03-02  8:34       ` Amir Goldstein
  2022-03-02 16:59       ` Amir Goldstein
  1 sibling, 0 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-02  8:34 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Miklos Szeredi, Al Viro, overlayfs, linux-fsdevel

On Wed, Mar 2, 2022 at 10:27 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Wed, Mar 02, 2022 at 09:43:50AM +0200, Amir Goldstein wrote:
> > On Wed, Mar 2, 2022 at 8:59 AM Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Tue, Mar 01, 2022 at 08:42:15PM +0200, Amir Goldstein wrote:
> > > > Miklos,
> > > >
> > > > Following your feedback on v2 [1], I moved the iostats to per-sb.
> > > >
> > > > Thanks,
> > > > Amir.
> > > >
> > > > [1] https://lore.kernel.org/linux-unionfs/20220228113910.1727819-1-amir73il@gmail.com/
> > > >
> > > > Changes since v2:
> > > > - Change from per-mount to per-sb io stats (szeredi)
> > > > - Avoid percpu loop when reading mountstats (dchinner)
> > > >
> > > > Changes since v1:
> > > > - Opt-in for per-mount io stats for overlayfs and fuse
> > >
> > > Why make it optional only for specific filesystem types? Shouldn't
> > > every superblock capture these stats and export them in exactly the
> > > same place?
> > >
> >
> > I am not sure what you are asking.
> >
> > Any filesystem can opt-in to get those generic io stats.
>
> Yes, but why even make it opt-in? Why not just set these up
> unconditionally in alloc_super() for all filesystems? Either this is
> useful information for userspace monitoring and diagnostics, or it's
> not useful at all. If it is useful, then all superblocks should
> export this stuff rather than just some special subset of
> filesystems where individual maintainers have noticed it and thought
> "that might be useful".
>
> Just enable it for every superblock....
>
> > This is exactly the same as any filesystem can already opt-in for
> > fs specific io stats using the s_op->show_stats() vfs op.
> >
> > All I did was to provide a generic implementation.
> > The generic io stats are collected and displayed for all filesystems the
> > same way.
> >
> > I only included patches for overlayfs and fuse to opt-in for generic io stats,
> > because I think those stats should be reported unconditionally (to
> > mount options)
> > for fuse/overlayfs and I hope that Miklos agrees with me.
>
> Yup, and I'm asking you why it should be optional - no filesystem
> ever sees this information - it's totally generic VFS level code
> except for the structure allocation. What's the point of *not*
> enabling it for every superblock unconditionally?
>
> > If there is wide consensus that all filesystems should have those stats
> > unconditionally (to mount options), then I can post another patch to make
> > the behavior not opt-in, but I have a feeling that this discussion
>
> That's exactly what I want you to do. We're already having this
> discussion, so let's get it over and done with right now.
>
> > How would you prefer the io stats behavior for xfs (or any fs) to be?
> > Unconditional to mount options?
> > Opt-in with mount option? (suggest name please)
> > Opt-in/out with mount options and default with Kconfig/sysfs tunable?
> > Anything else?
>
> Unconditional, for all filesystems, so they all display the same
> stats in exactly same place without any filesystem having to
> implement a single line of code anywhere. A single kconfig option
> like you already have is just fine to turn it off for those that
> don't want to use it.
>

Very well then. I'll post this version with your Suggested-by ;-)

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 0/6] Generic per-sb io stats
  2022-03-02  8:26     ` Dave Chinner
  2022-03-02  8:34       ` Amir Goldstein
@ 2022-03-02 16:59       ` Amir Goldstein
  2022-03-02 21:12         ` Dave Chinner
  1 sibling, 1 reply; 22+ messages in thread
From: Amir Goldstein @ 2022-03-02 16:59 UTC (permalink / raw)
  To: Dave Chinner, Miklos Szeredi; +Cc: Al Viro, overlayfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 2870 bytes --]

On Wed, Mar 2, 2022 at 10:27 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Wed, Mar 02, 2022 at 09:43:50AM +0200, Amir Goldstein wrote:
> > On Wed, Mar 2, 2022 at 8:59 AM Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Tue, Mar 01, 2022 at 08:42:15PM +0200, Amir Goldstein wrote:
> > > > Miklos,
> > > >
> > > > Following your feedback on v2 [1], I moved the iostats to per-sb.
> > > >
> > > > Thanks,
> > > > Amir.
> > > >
> > > > [1] https://lore.kernel.org/linux-unionfs/20220228113910.1727819-1-amir73il@gmail.com/
> > > >
> > > > Changes since v2:
> > > > - Change from per-mount to per-sb io stats (szeredi)
> > > > - Avoid percpu loop when reading mountstats (dchinner)
> > > >
> > > > Changes since v1:
> > > > - Opt-in for per-mount io stats for overlayfs and fuse
> > >
> > > Why make it optional only for specific filesystem types? Shouldn't
> > > every superblock capture these stats and export them in exactly the
> > > same place?
> > >
> >
> > I am not sure what you are asking.
> >
> > Any filesystem can opt-in to get those generic io stats.
>
> Yes, but why even make it opt-in? Why not just set these up
> unconditionally in alloc_super() for all filesystems? Either this is
> useful information for userspace monitoring and diagnostics, or it's
> not useful at all. If it is useful, then all superblocks should
> export this stuff rather than just some special subset of
> filesystems where individual maintainers have noticed it and thought
> "that might be useful".
>
> Just enable it for every superblock....
>

Not that simple.
First of all alloc_super() is used for all sorts of internal kernel sb
(e.g. pipes) that really don't need those stats.

Second, counters can have performance impact.
Depending on the fs, overhead may or may not be significant.
I used the attached patch for testing and ran some micro benchmarks
on tmpfs (10M small read/writes).
The patch hacks -omand for enabling iostats [*]

The results were not great. up to 20% slower when io size > default
batch size (32).
Increasing the counter batch size for rchar/wchar to 1K fixed this
micro benchmark,
but it may not be a one size fits all situation.
So I'd rather be cautious and not enable the feature unconditionally.

Miklos,

In the patches to enable per-sb iostats for fuse and overlayfs I argued why
I think that iostats are more important for fuse/overlayfs than for
other local fs.

Do you agree with my arguments for either fuse/overlayfs or both?
If so, would you rather that per-sb iostats be enabled unconditionally for
fuse/overlayfs or via an opt-in mount option?

Thanks,
Amir.

[*] I tried to figure out how fsconfig() could be used to implement a new
generic sb property (i.e. SB_IOSTATS), but I failed to understand if and how
the new mount API design is intended to address new generic properties, so
I resorted to the SB_MANDLOCK hack.

[-- Attachment #2: v3-0004-fs-enable-per-sb-I-O-stats-for-any-filesystem.patch --]
[-- Type: text/x-patch, Size: 4461 bytes --]

From 669d899a9e40c2c519a850a65e5001f2b03b05a8 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 2 Mar 2022 11:03:29 +0200
Subject: [PATCH v3 7/6] fs: enable per-sb I/O stats for any filesystem

Override SB_MANDLOCK for testing.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/fuse/inode.c            |  8 ++++----
 fs/namespace.c             |  2 ++
 fs/overlayfs/super.c       |  4 ----
 fs/super.c                 |  3 +++
 include/linux/fs_iostats.h | 30 ++++++++++++++++++++++++++++++
 5 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f19c666b9ac3..4f0316709df0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -145,8 +145,10 @@ static int fuse_reconfigure(struct fs_context *fsc)
 	struct super_block *sb = fsc->root->d_sb;
 
 	sync_filesystem(sb);
+#ifndef CONFIG_FS_IOSTATS
 	if (fsc->sb_flags & SB_MANDLOCK)
 		return -EINVAL;
+#endif
 
 	return 0;
 }
@@ -1514,13 +1516,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 	struct dentry *root_dentry;
 	int err;
 
+#ifndef CONFIG_FS_IOSTATS
 	err = -EINVAL;
 	if (sb->s_flags & SB_MANDLOCK)
 		goto err;
-
-	err = sb_iostats_init(sb);
-	if (err && err != -EOPNOTSUPP)
-		goto err;
+#endif
 
 	rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
 	fuse_sb_defaults(sb);
diff --git a/fs/namespace.c b/fs/namespace.c
index de6fae84f1a1..eb4ebb5081d9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3298,8 +3298,10 @@ int path_mount(const char *dev_name, struct path *path,
 		return ret;
 	if (!may_mount())
 		return -EPERM;
+#ifndef CONFIG_FS_IOSTATS
 	if (flags & SB_MANDLOCK)
 		warn_mandlock();
+#endif
 
 	/* Default to relatime unless overriden */
 	if (!(flags & MS_NOATIME))
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 94ab6adebe07..d2f569260edc 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1980,10 +1980,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_d_op = &ovl_dentry_operations;
 
-	err = sb_iostats_init(sb);
-	if (err && err != -EOPNOTSUPP)
-		goto out;
-
 	err = -ENOMEM;
 	ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
 	if (!ofs)
diff --git a/fs/super.c b/fs/super.c
index c447cadb402b..ebaf650f72bf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -180,6 +180,7 @@ static void destroy_unused_super(struct super_block *s)
 	up_write(&s->s_umount);
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
+	sb_iostats_destroy(s);
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
@@ -1522,6 +1523,8 @@ int vfs_get_tree(struct fs_context *fc)
 	sb->s_flags |= SB_BORN;
 
 	error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
+	if (!error)
+		error = sb_iostats_configure(sb);
 	if (unlikely(error)) {
 		fc_drop_locked(fc);
 		return error;
diff --git a/include/linux/fs_iostats.h b/include/linux/fs_iostats.h
index 60d1efbea7d9..af5b5ff201aa 100644
--- a/include/linux/fs_iostats.h
+++ b/include/linux/fs_iostats.h
@@ -6,6 +6,13 @@
 #include <linux/percpu_counter.h>
 #include <linux/slab.h>
 
+/*
+ * Override SB_MANDLOCK for testing.
+ *
+ * TODO: use fsopen()/fsconfig() flag parameters?
+ */
+#define SB_IOSTATS	SB_MANDLOCK
+
 /* Similar to task_io_accounting members */
 enum {
 	SB_IOSTATS_CHARS_RD,	/* bytes read via syscalls */
@@ -47,6 +54,10 @@ static inline int sb_iostats_init(struct super_block *sb)
 {
 	int err;
 
+	/* Already initialized? */
+	if (sb->s_iostats)
+		return 0;
+
 	sb->s_iostats = kmalloc(sizeof(struct sb_iostats), GFP_KERNEL);
 	if (!sb->s_iostats)
 		return -ENOMEM;
@@ -60,6 +71,19 @@ static inline int sb_iostats_init(struct super_block *sb)
 	}
 
 	sb->s_iostats->start_time = ktime_get_seconds();
+	sb->s_flags |= SB_IOSTATS;
+	return 0;
+}
+
+/* Enable/disable according to SB_IOSTATS flag */
+static inline int sb_iostats_configure(struct super_block *sb)
+{
+	bool want_iostats = (sb->s_flags & SB_IOSTATS);
+
+	if (want_iostats && !sb->s_iostats)
+		return sb_iostats_init(sb);
+	else if (!want_iostats && sb->s_iostats)
+		sb_iostats_destroy(sb);
 	return 0;
 }
 
@@ -109,6 +133,12 @@ static inline void sb_iostats_destroy(struct super_block *sb)
 {
 }
 
+static inline int sb_iostats_configure(struct super_block *sb)
+{
+	sb->s_flags &= ~SB_IOSTATS;
+	return 0;
+}
+
 static inline void sb_iostats_counter_inc(struct super_block *sb, int id)
 {
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 0/6] Generic per-sb io stats
  2022-03-02 16:59       ` Amir Goldstein
@ 2022-03-02 21:12         ` Dave Chinner
  2022-03-02 22:04           ` nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe" J. Bruce Fields
  2022-03-03  6:50           ` [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
  0 siblings, 2 replies; 22+ messages in thread
From: Dave Chinner @ 2022-03-02 21:12 UTC (permalink / raw)
  To: Amir Goldstein; +Cc: Miklos Szeredi, Al Viro, overlayfs, linux-fsdevel

On Wed, Mar 02, 2022 at 06:59:51PM +0200, Amir Goldstein wrote:
> On Wed, Mar 2, 2022 at 10:27 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Wed, Mar 02, 2022 at 09:43:50AM +0200, Amir Goldstein wrote:
> > > On Wed, Mar 2, 2022 at 8:59 AM Dave Chinner <david@fromorbit.com> wrote:
> > > >
> > > > On Tue, Mar 01, 2022 at 08:42:15PM +0200, Amir Goldstein wrote:
> > > > > Miklos,
> > > > >
> > > > > Following your feedback on v2 [1], I moved the iostats to per-sb.
> > > > >
> > > > > Thanks,
> > > > > Amir.
> > > > >
> > > > > [1] https://lore.kernel.org/linux-unionfs/20220228113910.1727819-1-amir73il@gmail.com/
> > > > >
> > > > > Changes since v2:
> > > > > - Change from per-mount to per-sb io stats (szeredi)
> > > > > - Avoid percpu loop when reading mountstats (dchinner)
> > > > >
> > > > > Changes since v1:
> > > > > - Opt-in for per-mount io stats for overlayfs and fuse
> > > >
> > > > Why make it optional only for specific filesystem types? Shouldn't
> > > > every superblock capture these stats and export them in exactly the
> > > > same place?
> > > >
> > >
> > > I am not sure what you are asking.
> > >
> > > Any filesystem can opt-in to get those generic io stats.
> >
> > Yes, but why even make it opt-in? Why not just set these up
> > unconditionally in alloc_super() for all filesystems? Either this is
> > useful information for userspace monitoring and diagnostics, or it's
> > not useful at all. If it is useful, then all superblocks should
> > export this stuff rather than just some special subset of
> > filesystems where individual maintainers have noticed it and thought
> > "that might be useful".
> >
> > Just enable it for every superblock....
> >
> 
> Not that simple.
> First of all alloc_super() is used for all sorts of internal kernel sb
> (e.g. pipes) that really don't need those stats.

Doesn't change anything - it still should be entirely set up and
managed by alloc_super/deactivate_locked_super.

If it really has to be selected by filesystem, alloc_super() has
an fstype passed to it and you can put a flag in the fstype to say
this is supported. Then filesystems only need to set a feature flag
to enable it, not have to manage allocation/freeing of something
that only the core VFS code uses.

> Second, counters can have performance impact.

So does adding branches for feature checks that nobody except some
special case uses.

But if the counters have perf overhead, then fix the counter
implementation to have less overhead.

> Depending on the fs, overhead may or may not be significant.
> I used the attached patch for testing and ran some micro benchmarks
> on tmpfs (10M small read/writes).
> The patch hacks -omand for enabling iostats [*]
> 
> The results were not great. up to 20% slower when io size > default
> batch size (32).
> Increasing the counter batch size for rchar/wchar to 1K fixed this
> micro benchmark,

Why do you think that is? Think about it: what size IO did you test?
I bet it was larger than 32 bytes and so it was forcing the
default generic percpu counter implementation to take a spin lock on
every syscall.  Yes?

Which means this addition will need to use a custom batch size for
*all* filesystems, and it will have to be substantially larger than
PAGE_SIZE because we need to amortise the cost of the global percpu
counter update across multiple IOs, not just one. IOWs, the counter
batch size likely needs to be set to several megabytes so that we
don't need to take the per-cpu spinlock in every decent sized IO
that applications issue.

So this isn't a negative against enabling the feature for all
superblocks - you just discovered a general problem because you
hadn't considered the impact of the counter implementation on 
high performance, high concurrency IO. overlay does get used in such
environments hence if the implementation isn't up to spec for
filesystems like XFS, tmpfs, etc that overlay might sit on top of
then it's not good enough for overlay, either.

> but it may not be a one size fits all situation.
> So I'd rather be cautious and not enable the feature unconditionally.

But now that you've realised that the counter configs need to be
specifically tuned for the use case and the perf overhead is pretty
much a non-issue, what's the argument against enabling it for
all superblocks?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe"
  2022-03-02 21:12         ` Dave Chinner
@ 2022-03-02 22:04           ` J. Bruce Fields
  2022-03-02 22:26             ` Josef Bacik
  2022-03-03  6:50           ` [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
  1 sibling, 1 reply; 22+ messages in thread
From: J. Bruce Fields @ 2022-03-02 22:04 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-nfs, Josef Bacik, Chuck Lever

I started seeing generic/373 fail on recent linux-next in NFS testing.

Bisect lands it on aaf40970b1d0 "fs: allow cross-vfsmount
reflink/dedupe".

The test fails because a clone between two mounts is expected to fail,
and no longer does.

In my setup both mounts are nfs mounts.  They are mounts of different
exports, and the exports are exports of different filesystems.  So it
does make sense that the clone should fail.

I see the NFS client send a CLONE rpc to the server, and the server
return success.  That seems wrong.

Both exported filesystems are xfs, and from the code it looks like the
server calls vfs_clone_file_range(), which ends up calling
xfs_file_remap_range().

Are we missing a check now in that xfs case?

I haven't looked any more closely at what's going on, so I could be
missing something.

--b.

> > 
> > Sent from my iPhone
> > 
> > > On Mar 1, 2022, at 6:53 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> > > 
> > > This is probably moving the check for whether a cross-vfsmount reflink
> > > is allowed to someplace that makes less sense for NFS?
> > > 
> > > I did find the thread I was thinking of:
> > > 
> > >    https://lore.kernel.org/linux-fsdevel/67ae4c62a4749ae6870c452d1b458cc5f48b8263.1645042835.git.josef@toxicpanda.com/#r
> > > 
> > > though maybe they're discussing a different problem.
> > > 
> > > --b.
> > > 
> > >> On Tue, Mar 01, 2022 at 06:04:33PM -0500, J. Bruce Fields wrote:
> > >> aaf40970b1d0f4ac41dad7963f35c9e353b4a41d is the first bad commit
> > >> commit aaf40970b1d0f4ac41dad7963f35c9e353b4a41d
> > >> Author: Josef Bacik <josef@toxicpanda.com>
> > >> Date:   Fri Feb 18 09:38:14 2022 -0500
> > >> 
> > >>    fs: allow cross-vfsmount reflink/dedupe
> > >> 
> > >>    Currently we disallow reflink and dedupe if the two files aren't on the
> > >>    same vfsmount.  However we really only need to disallow it if they're
> > >>    not on the same super block.  It is very common for btrfs to have a main
> > >>    subvolume that is mounted and then different subvolumes mounted at
> > >>    different locations.  It's allowed to reflink between these volumes, but
> > >>    the vfsmount check disallows this.  Instead fix dedupe to check for the
> > >>    same superblock, and simply remove the vfsmount check for reflink as it
> > >>    already does the superblock check.
> > >> 
> > >>    Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> > >>    Reviewed-by: Nikolay Borisov <nborisov@suse.com>
> > >>    Reviewed-by: David Sterba <dsterba@suse.com>
> > >>    Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> > >>    Signed-off-by: David Sterba <dsterba@suse.com>
> > >> 
> > >> fs/ioctl.c       | 4 ----
> > >> fs/remap_range.c | 7 +------
> > >> 2 files changed, 1 insertion(+), 10 deletions(-)
> > >> bisect found first bad commitgit bisect start
> > >> # bad: [6705cd745adbbeac6b13002c7a30060f7b2568a5] Add linux-next specific files for 20220228
> > >> git bisect bad 6705cd745adbbeac6b13002c7a30060f7b2568a5
> > >> # good: [7e57714cd0ad2d5bb90e50b5096a0e671dec1ef3] Linux 5.17-rc6
> > >> git bisect good 7e57714cd0ad2d5bb90e50b5096a0e671dec1ef3
> > >> # bad: [b1c65e65460a75a16cb8b658a28dd10fe465c59c] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
> > >> git bisect bad b1c65e65460a75a16cb8b658a28dd10fe465c59c
> > >> # bad: [03de4e1d1f2cb2993df929a481661cdebc6c2c3d] Merge branch 'hwmon-next' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck/linux-staging.git
> > >> git bisect bad 03de4e1d1f2cb2993df929a481661cdebc6c2c3d
> > >> # good: [6b9c42995b55b2d01731105ef85170944d8da96f] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git
> > >> git bisect good 6b9c42995b55b2d01731105ef85170944d8da96f
> > >> # good: [af69a7a5a64464a756328de668041c1075f86454] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/leo/linux.git
> > >> git bisect good af69a7a5a64464a756328de668041c1075f86454
> > >> # bad: [1f6aae68ded84c44b31249d2aae82be5ceacc758] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git
> > >> git bisect bad 1f6aae68ded84c44b31249d2aae82be5ceacc758
> > >> # bad: [25ebc69693daccd38953d627151dbebc369ea3ff] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git
> > >> git bisect bad 25ebc69693daccd38953d627151dbebc369ea3ff
> > >> # good: [0cbe7d755415ae2f40a8741eedb7c1717a21de53] btrfs: do not clean up repair bio if submit fails
> > >> git bisect good 0cbe7d755415ae2f40a8741eedb7c1717a21de53
> > >> # good: [5d820d692b5e550cdb590c362fdd35c000ebf420] Merge branch 'master' of git://git.kernel.org/pub/scm/fs/fscrypt/fscrypt.git
> > >> git bisect good 5d820d692b5e550cdb590c362fdd35c000ebf420
> > >> # good: [e9a9bca06a61e4700a8f81a1527c82966ed40922] btrfs: fix relocation crash due to premature return from btrfs_commit_transaction()
> > >> git bisect good e9a9bca06a61e4700a8f81a1527c82966ed40922
> > >> # bad: [d3cb500f9ca9e8a33a1e728cbd4c9c6e272f3fd8] Merge branch 'ext/qu/subpage-more-sizes' into for-next-next-v5.17-20220224
> > >> git bisect bad d3cb500f9ca9e8a33a1e728cbd4c9c6e272f3fd8
> > >> # bad: [fa3b92e259c8f71731e9bfb9cc6978d6184d8d8d] Merge branch 'ext/josef/cross-mount' into for-next-next-v5.17-20220224
> > >> git bisect bad fa3b92e259c8f71731e9bfb9cc6978d6184d8d8d
> > >> # bad: [aaf40970b1d0f4ac41dad7963f35c9e353b4a41d] fs: allow cross-vfsmount reflink/dedupe
> > >> git bisect bad aaf40970b1d0f4ac41dad7963f35c9e353b4a41d
> > >> # good: [244a73f987b06fd33041dc9cb0f99527a1c0815c] btrfs: remove the cross file system checks from remap
> > >> git bisect good 244a73f987b06fd33041dc9cb0f99527a1c0815c
> > >> # first bad commit: [aaf40970b1d0f4ac41dad7963f35c9e353b4a41d] fs: allow cross-vfsmount reflink/dedupe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe"
  2022-03-02 22:04           ` nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe" J. Bruce Fields
@ 2022-03-02 22:26             ` Josef Bacik
  2022-03-02 22:42               ` J. Bruce Fields
  0 siblings, 1 reply; 22+ messages in thread
From: Josef Bacik @ 2022-03-02 22:26 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: linux-fsdevel, linux-nfs, Chuck Lever

On Wed, Mar 02, 2022 at 05:04:50PM -0500, J. Bruce Fields wrote:
> I started seeing generic/373 fail on recent linux-next in NFS testing.
> 
> Bisect lands it on aaf40970b1d0 "fs: allow cross-vfsmount
> reflink/dedupe".
> 
> The test fails because a clone between two mounts is expected to fail,
> and no longer does.
> 
> In my setup both mounts are nfs mounts.  They are mounts of different
> exports, and the exports are exports of different filesystems.  So it
> does make sense that the clone should fail.
> 
> I see the NFS client send a CLONE rpc to the server, and the server
> return success.  That seems wrong.
> 
> Both exported filesystems are xfs, and from the code it looks like the
> server calls vfs_clone_file_range(), which ends up calling
> xfs_file_remap_range().
> 
> Are we missing a check now in that xfs case?
> 
> I haven't looked any more closely at what's going on, so I could be
> missing something.
> 

Yeah there's a few fstests that test this functionality that need to be removed,
I have patches pending for this in our fstests staging tree (since we run
fstests nightly on our tree)

https://github.com/btrfs/fstests/tree/staging

Right now the patches just remove the tests from auto since that's what we run,
I'll remove them properly once the patch lands in linus.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe"
  2022-03-02 22:26             ` Josef Bacik
@ 2022-03-02 22:42               ` J. Bruce Fields
  2022-03-02 23:45                 ` Josef Bacik
  0 siblings, 1 reply; 22+ messages in thread
From: J. Bruce Fields @ 2022-03-02 22:42 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-fsdevel, linux-nfs, Chuck Lever

On Wed, Mar 02, 2022 at 05:26:08PM -0500, Josef Bacik wrote:
> On Wed, Mar 02, 2022 at 05:04:50PM -0500, J. Bruce Fields wrote:
> > I started seeing generic/373 fail on recent linux-next in NFS testing.
> > 
> > Bisect lands it on aaf40970b1d0 "fs: allow cross-vfsmount
> > reflink/dedupe".
> > 
> > The test fails because a clone between two mounts is expected to fail,
> > and no longer does.
> > 
> > In my setup both mounts are nfs mounts.  They are mounts of different
> > exports, and the exports are exports of different filesystems.  So it
> > does make sense that the clone should fail.
> > 
> > I see the NFS client send a CLONE rpc to the server, and the server
> > return success.  That seems wrong.
> > 
> > Both exported filesystems are xfs, and from the code it looks like the
> > server calls vfs_clone_file_range(), which ends up calling
> > xfs_file_remap_range().
> > 
> > Are we missing a check now in that xfs case?
> > 
> > I haven't looked any more closely at what's going on, so I could be
> > missing something.
> > 
> 
> Yeah there's a few fstests that test this functionality that need to be removed,
> I have patches pending for this in our fstests staging tree (since we run
> fstests nightly on our tree)
> 
> https://github.com/btrfs/fstests/tree/staging
> 
> Right now the patches just remove the tests from auto since that's what we run,
> I'll remove them properly once the patch lands in linus.  Thanks,

So, out of curiosity, what is xfs doing in this case?  These are two
filesystems on separate partitions, is it falling back on a read/write
loop or something?

--b.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe"
  2022-03-02 22:42               ` J. Bruce Fields
@ 2022-03-02 23:45                 ` Josef Bacik
  2022-03-03  0:07                   ` J. Bruce Fields
  0 siblings, 1 reply; 22+ messages in thread
From: Josef Bacik @ 2022-03-02 23:45 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: linux-fsdevel, linux-nfs, Chuck Lever

On Wed, Mar 02, 2022 at 05:42:50PM -0500, J. Bruce Fields wrote:
> On Wed, Mar 02, 2022 at 05:26:08PM -0500, Josef Bacik wrote:
> > On Wed, Mar 02, 2022 at 05:04:50PM -0500, J. Bruce Fields wrote:
> > > I started seeing generic/373 fail on recent linux-next in NFS testing.
> > > 
> > > Bisect lands it on aaf40970b1d0 "fs: allow cross-vfsmount
> > > reflink/dedupe".
> > > 
> > > The test fails because a clone between two mounts is expected to fail,
> > > and no longer does.
> > > 
> > > In my setup both mounts are nfs mounts.  They are mounts of different
> > > exports, and the exports are exports of different filesystems.  So it
> > > does make sense that the clone should fail.
> > > 
> > > I see the NFS client send a CLONE rpc to the server, and the server
> > > return success.  That seems wrong.
> > > 
> > > Both exported filesystems are xfs, and from the code it looks like the
> > > server calls vfs_clone_file_range(), which ends up calling
> > > xfs_file_remap_range().
> > > 
> > > Are we missing a check now in that xfs case?
> > > 
> > > I haven't looked any more closely at what's going on, so I could be
> > > missing something.
> > > 
> > 
> > Yeah there's a few fstests that test this functionality that need to be removed,
> > I have patches pending for this in our fstests staging tree (since we run
> > fstests nightly on our tree)
> > 
> > https://github.com/btrfs/fstests/tree/staging
> > 
> > Right now the patches just remove the tests from auto since that's what we run,
> > I'll remove them properly once the patch lands in linus.  Thanks,
> 
> So, out of curiosity, what is xfs doing in this case?  These are two
> filesystems on separate partitions, is it falling back on a read/write
> loop or something?

I don't think so?  I'm actually kind of confused, because nfsd does
vfs_clone_file_range, and the only place I messed with for CLONE was
ioctl_clone_file, so the patch changed literally nothing, unless you aren't
using nfsd for the server?

And if they are in fact two different file systems the i_sb != i_sb of the
files, so there's something pretty strange going on here, my patch shouldn't
affect your setup.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe"
  2022-03-02 23:45                 ` Josef Bacik
@ 2022-03-03  0:07                   ` J. Bruce Fields
  2022-03-03  0:29                     ` Josef Bacik
  0 siblings, 1 reply; 22+ messages in thread
From: J. Bruce Fields @ 2022-03-03  0:07 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-fsdevel, linux-nfs, Chuck Lever

On Wed, Mar 02, 2022 at 06:45:12PM -0500, Josef Bacik wrote:
> On Wed, Mar 02, 2022 at 05:42:50PM -0500, J. Bruce Fields wrote:
> > On Wed, Mar 02, 2022 at 05:26:08PM -0500, Josef Bacik wrote:
> > > On Wed, Mar 02, 2022 at 05:04:50PM -0500, J. Bruce Fields wrote:
> > > > I started seeing generic/373 fail on recent linux-next in NFS testing.
> > > > 
> > > > Bisect lands it on aaf40970b1d0 "fs: allow cross-vfsmount
> > > > reflink/dedupe".
> > > > 
> > > > The test fails because a clone between two mounts is expected to fail,
> > > > and no longer does.
> > > > 
> > > > In my setup both mounts are nfs mounts.  They are mounts of different
> > > > exports, and the exports are exports of different filesystems.  So it
> > > > does make sense that the clone should fail.
> > > > 
> > > > I see the NFS client send a CLONE rpc to the server, and the server
> > > > return success.  That seems wrong.
> > > > 
> > > > Both exported filesystems are xfs, and from the code it looks like the
> > > > server calls vfs_clone_file_range(), which ends up calling
> > > > xfs_file_remap_range().
> > > > 
> > > > Are we missing a check now in that xfs case?
> > > > 
> > > > I haven't looked any more closely at what's going on, so I could be
> > > > missing something.
> > > > 
> > > 
> > > Yeah there's a few fstests that test this functionality that need to be removed,
> > > I have patches pending for this in our fstests staging tree (since we run
> > > fstests nightly on our tree)
> > > 
> > > https://github.com/btrfs/fstests/tree/staging
> > > 
> > > Right now the patches just remove the tests from auto since that's what we run,
> > > I'll remove them properly once the patch lands in linus.  Thanks,
> > 
> > So, out of curiosity, what is xfs doing in this case?  These are two
> > filesystems on separate partitions, is it falling back on a read/write
> > loop or something?
> 
> I don't think so?  I'm actually kind of confused, because nfsd does
> vfs_clone_file_range, and the only place I messed with for CLONE was
> ioctl_clone_file, so the patch changed literally nothing, unless you aren't
> using nfsd for the server?
> 
> And if they are in fact two different file systems the i_sb != i_sb of the
> files, so there's something pretty strange going on here, my patch shouldn't
> affect your setup.  Thanks,

Sorry, took me a minute to understand, myself:

It's actually only the client behavior that changed.  Previously the
client would reject an attempt to clone across filesystems, so the
server never saw such a request.  After this patch, the client will go
ahead and send the CLONE.  (Which, come to think of it, is probably the
right thing for the client to do.)

So the server's probably always had a bug, and this just uncovered it.

I'd be curious what the consequences are.  And where the check should be
(above or below vfs_clone_file_range()?).

--b.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe"
  2022-03-03  0:07                   ` J. Bruce Fields
@ 2022-03-03  0:29                     ` Josef Bacik
  2022-03-03  0:50                       ` J. Bruce Fields
  0 siblings, 1 reply; 22+ messages in thread
From: Josef Bacik @ 2022-03-03  0:29 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: linux-fsdevel, linux-nfs, Chuck Lever

On Wed, Mar 02, 2022 at 07:07:35PM -0500, J. Bruce Fields wrote:
> On Wed, Mar 02, 2022 at 06:45:12PM -0500, Josef Bacik wrote:
> > On Wed, Mar 02, 2022 at 05:42:50PM -0500, J. Bruce Fields wrote:
> > > On Wed, Mar 02, 2022 at 05:26:08PM -0500, Josef Bacik wrote:
> > > > On Wed, Mar 02, 2022 at 05:04:50PM -0500, J. Bruce Fields wrote:
> > > > > I started seeing generic/373 fail on recent linux-next in NFS testing.
> > > > > 
> > > > > Bisect lands it on aaf40970b1d0 "fs: allow cross-vfsmount
> > > > > reflink/dedupe".
> > > > > 
> > > > > The test fails because a clone between two mounts is expected to fail,
> > > > > and no longer does.
> > > > > 
> > > > > In my setup both mounts are nfs mounts.  They are mounts of different
> > > > > exports, and the exports are exports of different filesystems.  So it
> > > > > does make sense that the clone should fail.
> > > > > 
> > > > > I see the NFS client send a CLONE rpc to the server, and the server
> > > > > return success.  That seems wrong.
> > > > > 
> > > > > Both exported filesystems are xfs, and from the code it looks like the
> > > > > server calls vfs_clone_file_range(), which ends up calling
> > > > > xfs_file_remap_range().
> > > > > 
> > > > > Are we missing a check now in that xfs case?
> > > > > 
> > > > > I haven't looked any more closely at what's going on, so I could be
> > > > > missing something.
> > > > > 
> > > > 
> > > > Yeah there's a few fstests that test this functionality that need to be removed,
> > > > I have patches pending for this in our fstests staging tree (since we run
> > > > fstests nightly on our tree)
> > > > 
> > > > https://github.com/btrfs/fstests/tree/staging
> > > > 
> > > > Right now the patches just remove the tests from auto since that's what we run,
> > > > I'll remove them properly once the patch lands in linus.  Thanks,
> > > 
> > > So, out of curiosity, what is xfs doing in this case?  These are two
> > > filesystems on separate partitions, is it falling back on a read/write
> > > loop or something?
> > 
> > I don't think so?  I'm actually kind of confused, because nfsd does
> > vfs_clone_file_range, and the only place I messed with for CLONE was
> > ioctl_clone_file, so the patch changed literally nothing, unless you aren't
> > using nfsd for the server?
> > 
> > And if they are in fact two different file systems the i_sb != i_sb of the
> > files, so there's something pretty strange going on here, my patch shouldn't
> > affect your setup.  Thanks,
> 
> Sorry, took me a minute to understand, myself:
> 
> It's actually only the client behavior that changed.  Previously the
> client would reject an attempt to clone across filesystems, so the
> server never saw such a request.  After this patch, the client will go
> ahead and send the CLONE.  (Which, come to think of it, is probably the
> right thing for the client to do.)
> 
> So the server's probably always had a bug, and this just uncovered it.
> 
> I'd be curious what the consequences are.  And where the check should be
> (above or below vfs_clone_file_range()?).
> 

This is where I'm confused, this really shouldn't succeed

loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
                           struct file *file_out, loff_t pos_out,
                           loff_t len, unsigned int remap_flags)
{
        loff_t ret;

        WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);

        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
                return -EXDEV;


loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                            struct file *file_out, loff_t pos_out,
                            loff_t len, unsigned int remap_flags)
{
        loff_t ret;

        file_start_write(file_out);
        ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
                                  remap_flags);

And even if we get past here, I imagine XFS would freak out because it can't
find the extents (unless you're getting lucky and everything is lining up?).
I'm super confused...

Josef

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe"
  2022-03-03  0:29                     ` Josef Bacik
@ 2022-03-03  0:50                       ` J. Bruce Fields
  2022-03-04 20:03                         ` J. Bruce Fields
  0 siblings, 1 reply; 22+ messages in thread
From: J. Bruce Fields @ 2022-03-03  0:50 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-fsdevel, linux-nfs, Chuck Lever

On Wed, Mar 02, 2022 at 07:29:34PM -0500, Josef Bacik wrote:
> On Wed, Mar 02, 2022 at 07:07:35PM -0500, J. Bruce Fields wrote:
> > Sorry, took me a minute to understand, myself:
> > 
> > It's actually only the client behavior that changed.  Previously the
> > client would reject an attempt to clone across filesystems, so the
> > server never saw such a request.  After this patch, the client will go
> > ahead and send the CLONE.  (Which, come to think of it, is probably the
> > right thing for the client to do.)
> > 
> > So the server's probably always had a bug, and this just uncovered it.
> > 
> > I'd be curious what the consequences are.  And where the check should be
> > (above or below vfs_clone_file_range()?).
> > 
> 
> This is where I'm confused, this really shouldn't succeed
> 
> loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
>                            struct file *file_out, loff_t pos_out,
>                            loff_t len, unsigned int remap_flags)
> {
>         loff_t ret;
> 
>         WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
> 
>         if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
>                 return -EXDEV;
> 
> 
> loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
>                             struct file *file_out, loff_t pos_out,
>                             loff_t len, unsigned int remap_flags)
> {
>         loff_t ret;
> 
>         file_start_write(file_out);
>         ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
>                                   remap_flags);
> 
> And even if we get past here, I imagine XFS would freak out because it can't
> find the extents (unless you're getting lucky and everything is lining up?).
> I'm super confused...

Bah, I see what you mean.  Maybe there's something wrong with my setup.
I'll try some more stuff and report back....

--b.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 0/6] Generic per-sb io stats
  2022-03-02 21:12         ` Dave Chinner
  2022-03-02 22:04           ` nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe" J. Bruce Fields
@ 2022-03-03  6:50           ` Amir Goldstein
  1 sibling, 0 replies; 22+ messages in thread
From: Amir Goldstein @ 2022-03-03  6:50 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Miklos Szeredi, Al Viro, overlayfs, linux-fsdevel

> > Not that simple.
> > First of all alloc_super() is used for all sorts of internal kernel sb
> > (e.g. pipes) that really don't need those stats.
>
> Doesn't change anything - it still should be entirely set up and
> managed by alloc_super/deactivate_locked_super.
>
> If it really has to be selected by filesystem, alloc_super() has
> a fstype passes to it and you can put a falg in the fstype to say
> this is supported. Then filesystems only need to set a feature flag
> to enable it, not have to manage allocation/freeing of something
> that only the core VFS code uses.
>

That sounds good to me.

It will still leave the decision per fs maintainer whether to support
sb iostats or not, but at least it won't cluttle mountstats with less
interesting stats from the many pseudo filesystem mounts.

I wish I had a good heuristic for which filesystems iostats should
be enabled. I was thinking for blockdev fs and for fs with private BDI
the iostats make more sense, because those fs are more likely to be
io intensive. Then I would not initialize iostats in alloc_super() but in
vfs_get_tree() as my test patch does.

Overlayfs does not have a private BDI yet [1], but overlayfs can use
an fstype flag to opt-in for iostats as you suggested.

[1] https://lore.kernel.org/linux-unionfs/20210923130814.140814-2-cgxu519@mykernel.net/

> > Second, counters can have performance impact.
>
> So does adding branches for feature checks that nobody except some
> special case uses.
>
> But if the counters have perf overhead, then fix the counter
> implementation to have less overhead.
>
> > Depending on the fs, overhead may or may not be significant.
> > I used the attached patch for testing and ran some micro benchmarks
> > on tmpfs (10M small read/writes).
> > The patch hacks -omand for enabling iostats [*]
> >
> > The results were not great. up to 20% slower when io size > default
> > batch size (32).
> > Increasing the counter batch size for rchar/wchar to 1K fixed this
> > micro benchmark,
>
> Why do you think that is? Think about it: what size IO did you test?
> I bet it was larger than 32 bytes and so it was forcing the
> default generic percpu counter implementation to take a spin lock on
> every syscall.  Yes?

Yes. I know why that is.

>
> Which means this addition will need to use a custom batch size for
> *all* filesystems, and it will have to be substantially larger than
> PAGE_SIZE because we need to amortise the cost of the global percpu
> counter update across multiple IOs, not just one. IOWs, the counter
> batch size likely needs to be set to several megabytes so that we
> don't need to take the per-cpu spinlock in every decent sized IO
> that applications issue.
>
> So this isn't a negative against enabling the feature for all
> superblocks - you just discovered a general problem because you
> hadn't considered the impact of the counter implementation on
> high performance, high concurrency IO. overlay does get used in such
> environments hence if the implementation isn't up to spec for
> filesystems like XFS, tmpfs, etc that overlay might sit on top of
> then it's not good enough for overlay, either.
>

I very much agree with you here. The batch size should be increased
to several MB, but that brings me back to your comment in the beginning
of this thread that I should use percpu_counter_read_positive() instead of
percpu_counter_sum_positive().

It's true that stats don't need to be accurate, but those stats are pretty
useful for sanity tests and testing some simple operations without ever
seeing the counters increase can be very confusing to users, especially
if megabytes are missing.

My instinct here is that I should use percpu_counter_sum_positive()
for mountstats and if someone reads this file too often, they should
pay the price for it. It is also not hard at all to spot the process and
function to blame for the extra CPU cycles, so unless this change is
going to regress a very common use case, I don't think this is going
to be a real life problem.

> > but it may not be a one size fits all situation.
> > So I'd rather be cautious and not enable the feature unconditionally.
>
> But now that you've realised that the counter configs need to be
> specifically tuned for the use case and the perf overhead is pretty
> much a non-issue, what's the argument against enabling it for
> all superblocks?
>

One argument still is the bloat of the mountstats file.
If people do not want iostats for pseudo fs I do not want to force
feed it to them.

I think the blockdev+private BDI+explicit opt-in by fstype flag
is a good balance.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe"
  2022-03-03  0:50                       ` J. Bruce Fields
@ 2022-03-04 20:03                         ` J. Bruce Fields
  0 siblings, 0 replies; 22+ messages in thread
From: J. Bruce Fields @ 2022-03-04 20:03 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-fsdevel, linux-nfs, Chuck Lever

On Wed, Mar 02, 2022 at 07:50:01PM -0500, J. Bruce Fields wrote:
> On Wed, Mar 02, 2022 at 07:29:34PM -0500, Josef Bacik wrote:
> > On Wed, Mar 02, 2022 at 07:07:35PM -0500, J. Bruce Fields wrote:
> > > Sorry, took me a minute to understand, myself:
> > > 
> > > It's actually only the client behavior that changed.  Previously the
> > > client would reject an attempt to clone across filesystems, so the
> > > server never saw such a request.  After this patch, the client will go
> > > ahead and send the CLONE.  (Which, come to think of it, is probably the
> > > right thing for the client to do.)
> > > 
> > > So the server's probably always had a bug, and this just uncovered it.
> > > 
> > > I'd be curious what the consequences are.  And where the check should be
> > > (above or below vfs_clone_file_range()?).
> > > 
> > 
> > This is where I'm confused, this really shouldn't succeed
> > 
> > loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> >                            struct file *file_out, loff_t pos_out,
> >                            loff_t len, unsigned int remap_flags)
> > {
> >         loff_t ret;
> > 
> >         WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
> > 
> >         if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
> >                 return -EXDEV;
> > 
> > 
> > loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
> >                             struct file *file_out, loff_t pos_out,
> >                             loff_t len, unsigned int remap_flags)
> > {
> >         loff_t ret;
> > 
> >         file_start_write(file_out);
> >         ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
> >                                   remap_flags);
> > 
> > And even if we get past here, I imagine XFS would freak out because it can't
> > find the extents (unless you're getting lucky and everything is lining up?).
> > I'm super confused...
> 
> Bah, I see what you mean.  Maybe there's something wrong with my setup.
> I'll try some more stuff and report back....

Sorry for the noise, you're right, generic/373 is just being dumb.

I assumed it was mounting different exports for some reason.  But in
fact it's just doing a bind mount and then "cp --reflink=always" between
two mounts of the same filesystem.  Previously that got rejected out of
hand, now the client sends a CLONE and the server handles it.

Which is an improvement.  So it's only generic/373 that needs fixing.

--b.

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2022-03-04 20:09 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-01 18:42 [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein
2022-03-01 18:42 ` [PATCH v3 1/6] lib/percpu_counter: add helpers for arrays of counters Amir Goldstein
2022-03-01 18:42 ` [PATCH v3 2/6] fs: add optional iostats counters to struct super_block Amir Goldstein
2022-03-01 18:42 ` [PATCH v3 3/6] fs: collect per-sb io stats Amir Goldstein
2022-03-01 18:42 ` [PATCH v3 4/6] fs: report " Amir Goldstein
2022-03-01 18:42 ` [PATCH v3 5/6] ovl: opt-in for " Amir Goldstein
2022-03-01 18:42 ` [PATCH v3 6/6] fuse: " Amir Goldstein
2022-03-02  6:59 ` [PATCH v3 0/6] Generic " Dave Chinner
2022-03-02  7:43   ` Amir Goldstein
2022-03-02  8:26     ` Dave Chinner
2022-03-02  8:34       ` Amir Goldstein
2022-03-02 16:59       ` Amir Goldstein
2022-03-02 21:12         ` Dave Chinner
2022-03-02 22:04           ` nfs generic/373 failure after "fs: allow cross-vfsmount reflink/dedupe" J. Bruce Fields
2022-03-02 22:26             ` Josef Bacik
2022-03-02 22:42               ` J. Bruce Fields
2022-03-02 23:45                 ` Josef Bacik
2022-03-03  0:07                   ` J. Bruce Fields
2022-03-03  0:29                     ` Josef Bacik
2022-03-03  0:50                       ` J. Bruce Fields
2022-03-04 20:03                         ` J. Bruce Fields
2022-03-03  6:50           ` [PATCH v3 0/6] Generic per-sb io stats Amir Goldstein

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.