From: John Ogness <john.ogness@linutronix.de>
To: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
Peter Zijlstra <peterz@infradead.org>,
Thomas Gleixner <tglx@linutronix.de>,
linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [RFC PATCH 1/1] fs/namespace.c: use spinlock instead of busy loop
Date: Wed, 17 Jun 2020 12:46:58 +0206
Message-ID: <20200617104058.14902-2-john.ogness@linutronix.de>
In-Reply-To: <20200617104058.14902-1-john.ogness@linutronix.de>
The MNT_WRITE_HOLD flag is used to manually implement a per-cpu
optimized rwsem using busy waiting. This implementation is a problem
on PREEMPT_RT because write_seqlock() on @mount_lock (i.e. taking a
spinlock) does not disable preemption. A writer can therefore preempt
a task that has set MNT_WRITE_HOLD and then livelock in
__mnt_want_write(): it busy-loops with preemption disabled, waiting
for a flag that the preempted task cannot clear.
Reimplement the same semantics using per-cpu spinlocks. This
provides lockdep coverage and makes the code RT ready.
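For readers unfamiliar with the scheme: a reader (an __mnt_want_write()
caller) takes only the lock belonging to its own CPU, while a writer
takes every CPU's lock, always in the same order, so two writers cannot
deadlock against each other. Below is a minimal userspace sketch of this
"big reader lock" pattern using POSIX spinlocks, with a thread slot
standing in for the CPU id; all names and NSLOTS are illustrative, not
taken from the patch.

#include <pthread.h>

#define NSLOTS 4	/* stands in for the number of possible CPUs */

static pthread_spinlock_t slot_lock[NSLOTS];

static void locks_init(void)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		pthread_spin_init(&slot_lock[i], PTHREAD_PROCESS_PRIVATE);
}

/* Reader side: take only our own slot's lock; readers on different
 * slots never contend with each other. */
static void reader_lock(int slot)
{
	pthread_spin_lock(&slot_lock[slot]);
}

static void reader_unlock(int slot)
{
	pthread_spin_unlock(&slot_lock[slot]);
}

/*
 * Writer side: take all slot locks in ascending order. The fixed
 * order is what prevents writer-vs-writer deadlock; the unlock
 * order does not matter for correctness.
 */
static void writer_lock_all(void)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		pthread_spin_lock(&slot_lock[i]);
}

static void writer_unlock_all(void)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		pthread_spin_unlock(&slot_lock[i]);
}

In the patch itself, migrate_disable() pins the reader to its current
CPU so that "its own" lock remains well defined for the duration of
the critical section.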
Since this reverts some of the optimization work of
commit d3ef3d7351cc ("fs: mnt_want_write speedup"),
lmbench lat_mmap tests were performed to verify that there is no
obvious performance degradation.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
---
Here is the detailed test information...
TEST COMMAND
lat_mmap -P $pval -W 32 -N 50 64m file
OUTPUT FORMAT
pval: avg std
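(avg and std are taken here to be the sample mean and standard
deviation over repeated runs at each pval; a trivial C sketch of that
computation, with made-up input values, in case the convention is
unfamiliar:)

#include <math.h>
#include <stdio.h>

/* Sample mean and (n-1)-normalized standard deviation. */
static void avg_std(const double *v, int n, double *avg, double *std)
{
	double sum = 0.0, sq = 0.0;
	int i;

	for (i = 0; i < n; i++)
		sum += v[i];
	*avg = sum / n;
	for (i = 0; i < n; i++)
		sq += (v[i] - *avg) * (v[i] - *avg);
	*std = sqrt(sq / (n - 1));
}

int main(void)
{
	/* Hypothetical latencies from repeated runs at one pval. */
	double runs[] = { 275.0, 277.5, 274.0, 276.0, 275.5 };
	double avg, std;

	avg_std(runs, sizeof(runs) / sizeof(runs[0]), &avg, &std);
	printf("%.2f %.2f\n", avg, std);
	return 0;
}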
RESULTS (32 CPUs)
No Forced Preemption
       BEFORE                AFTER
  1:   275.60   1.82      274.40   0.55
  2:   296.20   3.83      286.80   1.92
  4:   310.20   4.44      304.40   2.51
  8:   359.20   2.28      357.80   2.95
 16:   417.40   2.51      412.20   3.90
 32:   625.60   2.07      622.00   3.08
 64:  1202.60  15.87     1202.20   6.14

No Forced Preemption, no PTI
       BEFORE                AFTER
  1:   278.00   2.12      274.40   1.52
  2:   301.00   3.67      289.80   6.06
  4:   333.40   7.73      303.80   2.39
  8:   389.80   3.56      351.80   3.42
 16:   425.00   3.46      408.20   4.87
 32:   606.00   1.22      605.60   1.82
 64:  1193.60   7.09     1184.80   4.27

Voluntary Kernel Preemption
       BEFORE                AFTER
  1:   277.80   1.30      278.20   1.10
  2:   291.20   0.84      286.60   2.30
  4:   310.00   1.87      304.80   1.30
  8:   360.00   2.55      354.60   1.14
 16:   414.00   1.58      414.00   2.35
 32:   619.60   5.50      607.00   3.74
 64:  1224.00   8.40     1219.60   6.19

Voluntary Kernel Preemption, no PTI
       BEFORE                AFTER
  1:   277.80   4.66      276.40   0.89
  2:   291.40   6.54      286.40   3.58
  4:   310.00   1.22      315.40   1.14
  8:   357.20   0.84      361.40   2.61
 16:   405.60   2.88      407.60   2.51
 32:   615.40   2.30      611.60   5.55
 64:  1219.80   9.91     1207.40  10.88

Preemptible Kernel
       BEFORE                AFTER
  1:   283.80   0.45      286.80   0.84
  2:   293.40   2.51      294.40   3.51
  4:   318.20   1.30      315.60   1.95
  8:   367.00   0.71      363.00   1.22
 16:   416.20   1.64      413.20   4.87
 32:   628.80   2.28      617.40   2.97
 64:  1277.20   9.88     1254.20   4.87

Preemptible Kernel, no PTI
       BEFORE                AFTER
  1:   283.00   1.73      288.40   1.67
  2:   305.80   2.05      297.00   3.24
  4:   321.40   4.34      318.60   2.79
  8:   370.20   2.39      366.40   2.70
 16:   413.20   3.11      412.40   2.41
 32:   616.40   2.61      620.20   2.05
 64:  1266.00   6.48     1255.80   3.90
fs/mount.h | 7 +++
fs/namespace.c | 118 +++++++++++++++++++++++++++++++++----------------
2 files changed, 86 insertions(+), 39 deletions(-)
diff --git a/fs/mount.h b/fs/mount.h
index c7abb7b394d8..627e635cd7d1 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -28,6 +28,12 @@ struct mnt_namespace {
struct mnt_pcp {
int mnt_count;
int mnt_writers;
+ /*
+ * If holding multiple instances of this lock, they
+ * must be ordered by cpu number.
+ */
+ spinlock_t mnt_writers_lock;
+ struct lock_class_key mnt_writers_lock_key;
};
struct mountpoint {
@@ -51,6 +57,7 @@ struct mount {
#else
int mnt_count;
int mnt_writers;
+ spinlock_t mnt_writers_lock;
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
diff --git a/fs/namespace.c b/fs/namespace.c
index f30ed401cc6d..e292c91f966d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -177,6 +177,7 @@ static struct mount *alloc_vfsmnt(const char *name)
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
if (mnt) {
int err;
+ int cpu;
err = mnt_alloc_id(mnt);
if (err)
@@ -194,9 +195,21 @@ static struct mount *alloc_vfsmnt(const char *name)
goto out_free_devname;
this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+
+ for_each_possible_cpu(cpu) {
+ struct mnt_pcp *writer =
+ per_cpu_ptr(mnt->mnt_pcp, cpu);
+
+ lockdep_register_key(&writer->mnt_writers_lock_key);
+ spin_lock_init(&writer->mnt_writers_lock);
+ lockdep_set_class(&writer->mnt_writers_lock,
+ &writer->mnt_writers_lock_key);
+ }
#else
+ (void)cpu;
mnt->mnt_count = 1;
mnt->mnt_writers = 0;
+ spin_lock_init(&mnt->mnt_writers_lock);
#endif
INIT_HLIST_NODE(&mnt->mnt_hash);
@@ -311,29 +324,26 @@ static int mnt_is_readonly(struct vfsmount *mnt)
int __mnt_want_write(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
+ spinlock_t *lock;
int ret = 0;
- preempt_disable();
- mnt_inc_writers(mnt);
- /*
- * The store to mnt_inc_writers must be visible before we pass
- * MNT_WRITE_HOLD loop below, so that the slowpath can see our
- * incremented count after it has set MNT_WRITE_HOLD.
- */
- smp_mb();
- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
- cpu_relax();
- /*
- * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
- * be set to match its requirements. So we must not load that until
- * MNT_WRITE_HOLD is cleared.
- */
- smp_rmb();
- if (mnt_is_readonly(m)) {
- mnt_dec_writers(mnt);
+#ifdef CONFIG_SMP
+ migrate_disable();
+ lock = &this_cpu_ptr(mnt->mnt_pcp)->mnt_writers_lock;
+#else
+ lock = &mnt->mnt_writers_lock;
+#endif
+
+ spin_lock(lock);
+ if (mnt_is_readonly(m))
ret = -EROFS;
- }
- preempt_enable();
+ else
+ mnt_inc_writers(mnt);
+ spin_unlock(lock);
+
+#ifdef CONFIG_SMP
+ migrate_enable();
+#endif
return ret;
}
@@ -459,17 +469,39 @@ void mnt_drop_write_file(struct file *file)
}
EXPORT_SYMBOL(mnt_drop_write_file);
+static void mnt_lock_writers(struct mount *mnt)
+{
+#ifdef CONFIG_SMP
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ spin_lock(&per_cpu_ptr(mnt->mnt_pcp,
+ cpu)->mnt_writers_lock);
+ }
+#else
+ spin_lock(&mnt->mnt_writers_lock);
+#endif
+}
+
+static void mnt_unlock_writers(struct mount *mnt)
+{
+#ifdef CONFIG_SMP
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ spin_unlock(&per_cpu_ptr(mnt->mnt_pcp,
+ cpu)->mnt_writers_lock);
+ }
+#else
+ spin_unlock(&mnt->mnt_writers_lock);
+#endif
+}
+
static int mnt_make_readonly(struct mount *mnt)
{
int ret = 0;
lock_mount_hash();
- mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
- /*
- * After storing MNT_WRITE_HOLD, we'll read the counters. This store
- * should be visible before we do.
- */
- smp_mb();
/*
* With writers on hold, if this value is zero, then there are
@@ -482,21 +514,17 @@ static int mnt_make_readonly(struct mount *mnt)
* sum up each counter, if we read a counter before it is incremented,
* but then read another CPU's count which it has been subsequently
* decremented from -- we would see more decrements than we should.
- * MNT_WRITE_HOLD protects against this scenario, because
- * mnt_want_write first increments count, then smp_mb, then spins on
- * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
- * we're counting up here.
+ * mnt_writers_lock protects against this scenario, because all CPUs
+ * are prevented from incrementing the counter until the summation of
+ * all CPU counters is complete.
*/
+ mnt_lock_writers(mnt);
if (mnt_get_writers(mnt) > 0)
ret = -EBUSY;
else
mnt->mnt.mnt_flags |= MNT_READONLY;
- /*
- * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
- * that become unheld will see MNT_READONLY.
- */
- smp_wmb();
- mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ mnt_unlock_writers(mnt);
+
unlock_mount_hash();
return ret;
}
@@ -514,15 +542,15 @@ int sb_prepare_remount_readonly(struct super_block *sb)
struct mount *mnt;
int err = 0;
- /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
+ /* Racy optimization. Recheck the counter under mnt_writers_lock. */
if (atomic_long_read(&sb->s_remove_count))
return -EBUSY;
lock_mount_hash();
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
+ mnt_lock_writers(mnt);
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
- smp_mb();
if (mnt_get_writers(mnt) > 0) {
err = -EBUSY;
break;
@@ -537,8 +565,10 @@ int sb_prepare_remount_readonly(struct super_block *sb)
smp_wmb();
}
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
- if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
+ if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) {
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+ mnt_unlock_writers(mnt);
+ }
}
unlock_mount_hash();
@@ -1099,6 +1129,7 @@ static void cleanup_mnt(struct mount *mnt)
{
struct hlist_node *p;
struct mount *m;
+ int cpu;
/*
* The warning here probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this happens, the
@@ -1117,6 +1148,15 @@ static void cleanup_mnt(struct mount *mnt)
dput(mnt->mnt.mnt_root);
deactivate_super(mnt->mnt.mnt_sb);
mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(cpu) {
+ struct mnt_pcp *writer = per_cpu_ptr(mnt->mnt_pcp, cpu);
+
+ lockdep_unregister_key(&writer->mnt_writers_lock_key);
+ }
+#else
+ (void)cpu;
+#endif
call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}
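A note on the lockdep key handling above, since it is easy to miss:
spin_lock_init() assigns one static lock class per call site, so
without further work every per-cpu mnt_writers_lock would share a
single class and mnt_lock_writers() taking all of them would look like
recursive locking to lockdep. Registering a dynamic key per lock gives
each one its own class, and the keys must be unregistered before the
memory is freed, hence the loop in cleanup_mnt(). A condensed sketch
of the idiom, detached from the mount specifics (struct and function
names are illustrative):

#include <linux/lockdep.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct pcpu_state {
	spinlock_t lock;
	struct lock_class_key key;	/* one lockdep class per lock */
};

static void pcpu_locks_init(struct pcpu_state __percpu *state)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct pcpu_state *s = per_cpu_ptr(state, cpu);

		lockdep_register_key(&s->key);
		spin_lock_init(&s->lock);
		lockdep_set_class(&s->lock, &s->key);
	}
}

static void pcpu_locks_destroy(struct pcpu_state __percpu *state)
{
	int cpu;

	/* Dynamic keys must be unregistered before freeing the memory. */
	for_each_possible_cpu(cpu)
		lockdep_unregister_key(&per_cpu_ptr(state, cpu)->key);
}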
--
2.20.1