From: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
To: linux-kernel@vger.kernel.org
Cc: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>,
"Eric W . Biederman" <ebiederm@xmission.com>,
Manfred Spraul <manfred@colorfullife.com>,
Andrew Morton <akpm@linux-foundation.org>,
Davidlohr Bueso <dave@stgolabs.net>,
Johannes Weiner <hannes@cmpxchg.org>,
Michal Hocko <mhocko@suse.com>,
Vladimir Davydov <vdavydov.dev@gmail.com>,
Andrei Vagin <avagin@gmail.com>,
Christian Brauner <christian@brauner.io>,
Pavel Tikhomirov <ptikhomirov@virtuozzo.com>,
Alexander Mikhalitsyn <alexander@mihalicyn.com>
Subject: [RFC PATCH] shm: extend forced shm destroy to support objects from several IPC nses
Date: Wed, 14 Jul 2021 20:30:05 +0300 [thread overview]
Message-ID: <20210714173005.491941-1-alexander.mikhalitsyn@virtuozzo.com> (raw)
In-Reply-To: <CAJqdLrov0VBzHamSvMRKBHSJX+NROUx0TUsRD9U0zEWUn5XxDA@mail.gmail.com>
This is total rework of fix.
Thanks to Eric Biederman for suggestions (but may be I've misunderstood some of them :))
I've tested it with reproducer of the original problem. But of course it needs
detailed testing. I hope that I get some general comments about design and implementation.
ToDo: remove unneeded "ns" argument from shm_destroy, shm_rmid and other functions.
Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Christian Brauner <christian@brauner.io>
Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Cc: Alexander Mikhalitsyn <alexander@mihalicyn.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
include/linux/shm.h | 5 +-
ipc/shm.c | 128 +++++++++++++++++++++++++++++++-------------
2 files changed, 95 insertions(+), 38 deletions(-)
diff --git a/include/linux/shm.h b/include/linux/shm.h
index d8e69aed3d32..8053ed1433df 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -11,14 +11,15 @@ struct file;
#ifdef CONFIG_SYSVIPC
struct sysv_shm {
- struct list_head shm_clist;
+ spinlock_t shm_clist_lock;
+ struct list_head shm_clist;
};
long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
unsigned long shmlba);
bool is_file_shm_hugepages(struct file *file);
void exit_shm(struct task_struct *task);
-#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist)
+void shm_init_task(struct task_struct *task);
#else
struct sysv_shm {
/* empty */
diff --git a/ipc/shm.c b/ipc/shm.c
index 748933e376ca..a746886a0e00 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -65,6 +65,7 @@ struct shmid_kernel /* private to the kernel */
/* The task created the shm object. NULL if the task is dead. */
struct task_struct *shm_creator;
struct list_head shm_clist; /* list by creator */
+ struct ipc_namespace *ns;
} __randomize_layout;
/* shm_mode upper byte flags */
@@ -115,6 +116,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ BUG_ON(shp->ns && ns != shp->ns);
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
@@ -225,9 +227,32 @@ static void shm_rcu_free(struct rcu_head *head)
kfree(shp);
}
+static inline void task_shm_clist_lock(struct task_struct *task)
+{
+ spin_lock(&task->sysvshm.shm_clist_lock);
+}
+
+static inline void task_shm_clist_unlock(struct task_struct *task)
+{
+ spin_unlock(&task->sysvshm.shm_clist_lock);
+}
+
+void shm_clist_rm(struct shmid_kernel *shp)
+{
+ if (!shp->shm_creator)
+ return;
+
+ task_shm_clist_lock(shp->shm_creator);
+ list_del_init(&shp->shm_clist);
+ task_shm_clist_unlock(shp->shm_creator);
+ shp->shm_creator = NULL;
+}
+
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
{
- list_del(&s->shm_clist);
+ WARN_ON(s->ns && ns != s->ns);
+ //list_del_init(&s->shm_clist);
+ shm_clist_rm(s);
ipc_rmid(&shm_ids(ns), &s->shm_perm);
}
@@ -306,10 +331,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
*
* 2) sysctl kernel.shm_rmid_forced is set to 1.
*/
-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+static bool shm_may_destroy(struct shmid_kernel *shp)
{
return (shp->shm_nattch == 0) &&
- (ns->shm_rmid_forced ||
+ (shp->ns->shm_rmid_forced ||
(shp->shm_perm.mode & SHM_DEST));
}
@@ -340,7 +365,7 @@ static void shm_close(struct vm_area_struct *vma)
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
shp->shm_dtim = ktime_get_real_seconds();
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
@@ -361,10 +386,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
*
* As shp->* are changed under rwsem, it's safe to skip shp locking.
*/
- if (shp->shm_creator != NULL)
+ if (!list_empty(&shp->shm_clist))
return 0;
- if (shm_may_destroy(ns, shp)) {
+ if (shm_may_destroy(shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
@@ -379,51 +404,77 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
up_write(&shm_ids(ns).rwsem);
}
+void shm_init_task(struct task_struct *task)
+{
+ INIT_LIST_HEAD(&(task)->sysvshm.shm_clist);
+ spin_lock_init(&task->sysvshm.shm_clist_lock);
+}
+
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
- struct ipc_namespace *ns = task->nsproxy->ipc_ns;
+ LIST_HEAD(tmp);
struct shmid_kernel *shp, *n;
if (list_empty(&task->sysvshm.shm_clist))
return;
- /*
- * If kernel.shm_rmid_forced is not set then only keep track of
- * which shmids are orphaned, so that a later set of the sysctl
- * can clean them up.
- */
- if (!ns->shm_rmid_forced) {
- down_read(&shm_ids(ns).rwsem);
- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
- shp->shm_creator = NULL;
- /*
- * Only under read lock but we are only called on current
- * so no entry on the list will be shared.
- */
- list_del(&task->sysvshm.shm_clist);
- up_read(&shm_ids(ns).rwsem);
- return;
- }
+ rcu_read_lock(); /* for refcount_inc_not_zero */
+ task_shm_clist_lock(task);
- /*
- * Destroy all already created segments, that were not yet mapped,
- * and mark any mapped as orphan to cover the sysctl toggling.
- * Destroy is skipped if shm_may_destroy() returns false.
- */
- down_write(&shm_ids(ns).rwsem);
list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
+ struct ipc_namespace *ns = shp->ns;
+
+ /*
+ * Remove shm from task list and nullify shm_creator which
+ * marks object as orphaned.
+ *
+ * If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later set of the sysctl
+ * can clean them up.
+ */
+ list_del_init(&shp->shm_clist);
shp->shm_creator = NULL;
- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
+ printk("exit_shm() %px refcnt=%u, id=%d,key=%x\n", shp,
+ refcount_read(&shp->shm_perm.refcount),
+ shp->shm_perm.id, shp->shm_perm.key
+ );
+
+ /*
+ * Will destroy all already created segments, that were not yet mapped,
+ * and mark any mapped as orphan to cover the sysctl toggling.
+ * Destroy is skipped if shm_may_destroy() returns false.
+ */
+ if (shp->ns->shm_rmid_forced && shm_may_destroy(shp)) {
+ /*
+ * We may race with shm_exit_ns() if refcounter
+ * already zero. Let's skip shm_destroy() of such
+ * shm object as it will be destroyed during shm_exit_ns()
+ */
+ if (!refcount_inc_not_zero(&ns->ns.count)) /* get_ipc_ns */
+ continue;
+
+ list_add(&shp->shm_clist, &tmp);
}
}
- /* Remove the list head from any segments still attached. */
list_del(&task->sysvshm.shm_clist);
- up_write(&shm_ids(ns).rwsem);
+ task_shm_clist_unlock(task);
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(shp, n, &tmp, shm_clist) {
+ struct ipc_namespace *ns = shp->ns;
+
+ list_del_init(&shp->shm_clist);
+
+ down_write(&shm_ids(ns).rwsem);
+ shm_lock_by_ptr(shp);
+ /* will do put_ipc_ns(shp->ns) */
+ shm_destroy(ns, shp);
+ up_write(&shm_ids(ns).rwsem);
+ put_ipc_ns(ns); /* see refcount_inc_not_zero */
+ }
}
static vm_fault_t shm_fault(struct vm_fault *vmf)
@@ -681,6 +732,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
goto no_id;
list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
+ shp->ns = ns;
/*
* shmid gets reported as "inode#" in /proc/pid/maps.
@@ -1573,7 +1625,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
down_write(&shm_ids(ns).rwsem);
shp = shm_lock(ns, shmid);
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+#if 0
+ if (shp->shm_nattch)
+ list_del_init(&shp->shm_clist);
+#endif
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
--
2.31.1
next prev parent reply other threads:[~2021-07-14 17:30 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-07-06 13:22 [PATCH 0/2] shm: omit forced shm destroy if task IPC namespace was changed Alexander Mikhalitsyn
2021-07-06 13:22 ` [PATCH 1/2] shm: skip shm_destroy " Alexander Mikhalitsyn
2021-07-06 13:22 ` [PATCH 2/2] ipc: WARN if trying to remove ipc object which is absent Alexander Mikhalitsyn
2021-07-10 1:12 ` [PATCH 0/2] shm: omit forced shm destroy if task IPC namespace was changed Andrew Morton
2021-07-10 10:55 ` Alexander Mihalicyn
[not found] ` <CALgW_8VUk0us_umLncUv2DUMkOi3qixmT+YkHV4Dhpt_nNMZHw@mail.gmail.com>
2021-07-11 10:33 ` Alexander Mihalicyn
2021-07-11 11:46 ` Manfred Spraul
2021-07-12 9:54 ` Alexander Mihalicyn
2021-07-12 19:18 ` Eric W. Biederman
2021-07-12 19:27 ` Alexander Mihalicyn
2021-07-14 17:30 ` Alexander Mikhalitsyn [this message]
2021-07-21 6:32 ` [RFC PATCH] shm: extend forced shm destroy to support objects from several IPC nses Manfred Spraul
2021-07-22 18:46 ` Manfred Spraul
2021-07-30 14:52 ` Alexander Mihalicyn
2021-07-14 17:42 ` [PATCH 0/2] shm: omit forced shm destroy if task IPC namespace was changed Alexander Mihalicyn
2021-09-23 16:36 ` Manfred Spraul
2021-09-24 16:45 ` Eric W. Biederman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210714173005.491941-1-alexander.mikhalitsyn@virtuozzo.com \
--to=alexander.mikhalitsyn@virtuozzo.com \
--cc=akpm@linux-foundation.org \
--cc=alexander@mihalicyn.com \
--cc=avagin@gmail.com \
--cc=christian@brauner.io \
--cc=dave@stgolabs.net \
--cc=ebiederm@xmission.com \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=manfred@colorfullife.com \
--cc=mhocko@suse.com \
--cc=ptikhomirov@virtuozzo.com \
--cc=vdavydov.dev@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).