From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: viro@zeniv.linux.org.uk, brauner@kernel.org, jack@suse.cz,
dchinner@redhat.com, casey@schaufler-ca.com,
ben.wolsieffer@hefring.com, paulmck@kernel.org,
david@redhat.com, avagin@google.com, usama.anjum@collabora.com,
peterx@redhat.com, hughd@google.com, ryan.roberts@arm.com,
wangkefeng.wang@huawei.com, Liam.Howlett@Oracle.com,
yuzhao@google.com, axelrasmussen@google.com, lstoakes@gmail.com,
talumbau@google.com, willy@infradead.org, vbabka@suse.cz,
mgorman@techsingularity.net, jhubbard@nvidia.com,
vishal.moola@gmail.com, mathieu.desnoyers@efficios.com,
dhowells@redhat.com, jgg@ziepe.ca, sidhartha.kumar@oracle.com,
andriy.shevchenko@linux.intel.com, yangxingui@huawei.com,
keescook@chromium.org, linux-kernel@vger.kernel.org,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
kernel-team@android.com, surenb@google.com
Subject: [RFC 3/3] mm/maps: read proc/pid/maps under RCU
Date: Mon, 15 Jan 2024 10:38:36 -0800 [thread overview]
Message-ID: <20240115183837.205694-4-surenb@google.com> (raw)
In-Reply-To: <20240115183837.205694-1-surenb@google.com>
With maple_tree supporting vma tree traversal under RCU and per-vma locks
making vma access RCU-safe, /proc/pid/maps can be read under RCU and
without the need to read-lock mmap_lock. However, VMA content can change
out from under us; therefore we need to pin the pointer fields used when
generating the output (currently only vm_file and anon_name).
In addition, we validate the data before publishing it to the user using the
new seq_file validate() interface. This way we keep this mechanism consistent
with the previous behavior where data tearing is possible only at page
boundaries.
This change is designed to reduce mmap_lock contention and prevent a
process reading /proc/pid/maps files (often a low-priority task, such as
monitoring/data collection services) from blocking address space updates.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
fs/proc/internal.h | 3 ++
fs/proc/task_mmu.c | 130 ++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 120 insertions(+), 13 deletions(-)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a71ac5379584..47233408550b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -290,6 +290,9 @@ struct proc_maps_private {
struct task_struct *task;
struct mm_struct *mm;
struct vma_iterator iter;
+ int mm_lock_seq;
+ struct anon_vma_name *anon_name;
+ struct file *vm_file;
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 62b16f42d5d2..d4305cfdca58 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -141,6 +141,22 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
return vma;
}
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool needs_mmap_lock(struct seq_file *m)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * smaps and numa_maps perform page table walk, therefore require
+ * mmap_lock but maps can be read under RCU.
+ */
+ return m->op != &proc_pid_maps_op;
+#else
+ /* Without per-vma locks VMA access is not RCU-safe */
+ return true;
+#endif
+}
+
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
@@ -162,11 +178,17 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return NULL;
}
- if (mmap_read_lock_killable(mm)) {
- mmput(mm);
- put_task_struct(priv->task);
- priv->task = NULL;
- return ERR_PTR(-EINTR);
+ if (needs_mmap_lock(m)) {
+ if (mmap_read_lock_killable(mm)) {
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
+ return ERR_PTR(-EINTR);
+ }
+ } else {
+ /* For memory barrier see the comment for mm_lock_seq in mm_struct */
+ priv->mm_lock_seq = smp_load_acquire(&priv->mm->mm_lock_seq);
+ rcu_read_lock();
}
vma_iter_init(&priv->iter, mm, last_addr);
@@ -195,7 +217,10 @@ static void m_stop(struct seq_file *m, void *v)
return;
release_task_mempolicy(priv);
- mmap_read_unlock(mm);
+ if (needs_mmap_lock(m))
+ mmap_read_unlock(mm);
+ else
+ rcu_read_unlock();
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -283,8 +308,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
start = vma->vm_start;
end = vma->vm_end;
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
- if (mm)
- anon_name = anon_vma_name(vma);
+ if (mm) {
+ anon_name = needs_mmap_lock(m) ? anon_vma_name(vma) :
+ anon_vma_name_get_rcu(vma);
+ }
/*
* Print the dentry name for named mappings, and a
@@ -338,19 +365,96 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
seq_puts(m, name);
}
seq_putc(m, '\n');
+ if (anon_name && !needs_mmap_lock(m))
+ anon_vma_name_put(anon_name);
+}
+
+/*
+ * Pin vm_area_struct fields used by show_map_vma. We also copy pinned fields
+ * into proc_maps_private because by the time put_vma_fields() is called, VMA
+ * might have changed and these fields might be pointing to different objects.
+ */
+static bool get_vma_fields(struct vm_area_struct *vma, struct proc_maps_private *priv)
+{
+ if (vma->vm_file) {
+ priv->vm_file = get_file_rcu(&vma->vm_file);
+ if (!priv->vm_file)
+ return false;
+
+ } else
+ priv->vm_file = NULL;
+
+ if (vma->anon_name) {
+ priv->anon_name = anon_vma_name_get_rcu(vma);
+ if (!priv->anon_name) {
+ if (priv->vm_file) {
+ fput(priv->vm_file);
+ return false;
+ }
+ }
+ } else
+ priv->anon_name = NULL;
+
+ return true;
+}
+
+static void put_vma_fields(struct proc_maps_private *priv)
+{
+ if (priv->anon_name)
+ anon_vma_name_put(priv->anon_name);
+ if (priv->vm_file)
+ fput(priv->vm_file);
}
static int show_map(struct seq_file *m, void *v)
{
- show_map_vma(m, v);
+ struct proc_maps_private *priv = m->private;
+
+ if (needs_mmap_lock(m))
+ show_map_vma(m, v);
+ else {
+ /*
+ * Stop immediately if the VMA changed from under us.
+ * Validation step will prevent publishing already cached data.
+ */
+ if (!get_vma_fields(v, priv))
+ return -EAGAIN;
+
+ show_map_vma(m, v);
+ put_vma_fields(priv);
+ }
+
return 0;
}
+static int validate_map(struct seq_file *m, void *v)
+{
+ if (!needs_mmap_lock(m)) {
+ struct proc_maps_private *priv = m->private;
+ int mm_lock_seq;
+
+ /* For memory barrier see the comment for mm_lock_seq in mm_struct */
+ mm_lock_seq = smp_load_acquire(&priv->mm->mm_lock_seq);
+ if (mm_lock_seq != priv->mm_lock_seq) {
+ /*
+ * mmap_lock contention is detected. Wait for mmap_lock
+ * write to be released, discard stale data and retry.
+ */
+ mmap_read_lock(priv->mm);
+ mmap_read_unlock(priv->mm);
+ return -EAGAIN;
+ }
+ }
+ return 0;
+
+}
+
static const struct seq_operations proc_pid_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_map
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_map,
+ .validate = validate_map,
};
static int pid_maps_open(struct inode *inode, struct file *file)
--
2.43.0.381.gb435a96ce8-goog
next prev parent reply other threads:[~2024-01-15 18:38 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-01-15 18:38 [RFC 0/3] reading proc/pid/maps under RCU Suren Baghdasaryan
2024-01-15 18:38 ` [RFC 1/3] mm: make vm_area_struct anon_name field RCU-safe Suren Baghdasaryan
2024-01-15 18:38 ` [RFC 2/3] seq_file: add validate() operation to seq_operations Suren Baghdasaryan
2024-01-15 18:38 ` Suren Baghdasaryan [this message]
2024-01-16 14:42 ` [RFC 0/3] reading proc/pid/maps under RCU Vlastimil Babka
2024-01-16 14:46 ` Vlastimil Babka
2024-01-16 17:57 ` Suren Baghdasaryan
2024-01-18 17:58 ` Suren Baghdasaryan
2024-01-22 7:23 ` Suren Baghdasaryan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240115183837.205694-4-surenb@google.com \
--to=surenb@google.com \
--cc=Liam.Howlett@Oracle.com \
--cc=akpm@linux-foundation.org \
--cc=andriy.shevchenko@linux.intel.com \
--cc=avagin@google.com \
--cc=axelrasmussen@google.com \
--cc=ben.wolsieffer@hefring.com \
--cc=brauner@kernel.org \
--cc=casey@schaufler-ca.com \
--cc=david@redhat.com \
--cc=dchinner@redhat.com \
--cc=dhowells@redhat.com \
--cc=hughd@google.com \
--cc=jack@suse.cz \
--cc=jgg@ziepe.ca \
--cc=jhubbard@nvidia.com \
--cc=keescook@chromium.org \
--cc=kernel-team@android.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lstoakes@gmail.com \
--cc=mathieu.desnoyers@efficios.com \
--cc=mgorman@techsingularity.net \
--cc=paulmck@kernel.org \
--cc=peterx@redhat.com \
--cc=ryan.roberts@arm.com \
--cc=sidhartha.kumar@oracle.com \
--cc=talumbau@google.com \
--cc=usama.anjum@collabora.com \
--cc=vbabka@suse.cz \
--cc=viro@zeniv.linux.org.uk \
--cc=vishal.moola@gmail.com \
--cc=wangkefeng.wang@huawei.com \
--cc=willy@infradead.org \
--cc=yangxingui@huawei.com \
--cc=yuzhao@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).