All of lore.kernel.org
 help / color / mirror / Atom feed
* Support for SEM_UNDO, round 2
@ 2010-08-04 17:02 Dan Smith
       [not found] ` <1280941345-27566-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Dan Smith @ 2010-08-04 17:02 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This set includes a change to the base ipc/sem.c code to store a
pointer to the namespace to which the undo list belongs, as well as
the allocation function changes from the previous version of the
patch to match.  The second patch integrates the feedback from the
list and passes all my tests.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/2] Add ipc_namespace to struct sem_undo_list
       [not found] ` <1280941345-27566-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-08-04 17:02   ` Dan Smith
  2010-08-04 17:02   ` [PATCH 2/2] Add support for the per-task sem_undo list (v2) Dan Smith
  1 sibling, 0 replies; 9+ messages in thread
From: Dan Smith @ 2010-08-04 17:02 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

Checkpoint/Restart needs to have a pointer to the ipc_namespace that the
sem_undo_list applies to in order to properly bring up and tear down
the object hash.  This patch adds a pointer to the namespace to the list
structure, as well as breaks out the allocation of the undo list to a
separate function (which is needed in a later C/R patch anyway).

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/sem.h |    2 ++
 ipc/sem.c           |   30 +++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 8a4adbe..8cf9636 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -127,12 +127,14 @@ struct sem_undo {
 	short *			semadj;		/* array of adjustments, one per semaphore */
 };
 
+struct ipc_namespace;
 /* sem_undo_list controls shared access to the list of sem_undo structures
  * that may be shared among all a CLONE_SYSVSEM task group.
  */ 
 struct sem_undo_list {
 	atomic_t		refcnt;
 	spinlock_t		lock;
+	struct ipc_namespace	*ipc_ns;
 	struct list_head	list_proc;
 };
 
diff --git a/ipc/sem.c b/ipc/sem.c
index 37da85e..e439b73 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -983,6 +983,21 @@ asmlinkage long SyS_semctl(int semid, int semnum, int cmd, union semun arg)
 SYSCALL_ALIAS(sys_semctl, SyS_semctl);
 #endif
 
+static struct sem_undo_list *alloc_undo_list(struct ipc_namespace *ipc_ns)
+{
+	struct sem_undo_list *undo_list;
+
+	undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+	if (undo_list == NULL)
+		return NULL;
+	spin_lock_init(&undo_list->lock);
+	atomic_set(&undo_list->refcnt, 1);
+	INIT_LIST_HEAD(&undo_list->list_proc);
+	undo_list->ipc_ns = ipc_ns;
+
+	return undo_list;
+}
+
 /* If the task doesn't already have a undo_list, then allocate one
  * here.  We guarantee there is only one thread using this undo list,
  * and current is THE ONE
@@ -994,19 +1009,16 @@ SYSCALL_ALIAS(sys_semctl, SyS_semctl);
  *
  * This can block, so callers must hold no locks.
  */
-static inline int get_undo_list(struct sem_undo_list **undo_listp)
+static inline int get_undo_list(struct sem_undo_list **undo_listp,
+				struct ipc_namespace *ipc_ns)
 {
 	struct sem_undo_list *undo_list;
 
 	undo_list = current->sysvsem.undo_list;
 	if (!undo_list) {
-		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
-		if (undo_list == NULL)
+		undo_list = alloc_undo_list(ipc_ns);
+		if (!undo_list)
 			return -ENOMEM;
-		spin_lock_init(&undo_list->lock);
-		atomic_set(&undo_list->refcnt, 1);
-		INIT_LIST_HEAD(&undo_list->list_proc);
-
 		current->sysvsem.undo_list = undo_list;
 	}
 	*undo_listp = undo_list;
@@ -1057,7 +1069,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 	int nsems;
 	int error;
 
-	error = get_undo_list(&ulp);
+	error = get_undo_list(&ulp, ns);
 	if (error)
 		return ERR_PTR(error);
 
@@ -1328,7 +1340,7 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
 	int error;
 
 	if (clone_flags & CLONE_SYSVSEM) {
-		error = get_undo_list(&undo_list);
+		error = get_undo_list(&undo_list, tsk->nsproxy->ipc_ns);
 		if (error)
 			return error;
 		atomic_inc(&undo_list->refcnt);
-- 
1.7.1.1

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/2] Add support for the per-task sem_undo list (v2)
       [not found] ` <1280941345-27566-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2010-08-04 17:02   ` [PATCH 1/2] Add ipc_namespace to struct sem_undo_list Dan Smith
@ 2010-08-04 17:02   ` Dan Smith
       [not found]     ` <1280941345-27566-3-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 9+ messages in thread
From: Dan Smith @ 2010-08-04 17:02 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

The semaphore undo list is a set of adjustments to be made to semaphores
held by a task on exit.  Right now, we do not checkpoint or restore this
list which could cause undesirable behavior by a restarted process on exit.

Changes in v2:
 - Remove collect operation
 - Add a BUILD_BUG_ON() to ensure sizeof(short) == sizeof(__u16)
 - Use sizeof(__u16) when copying to/from checkpoint header
 - Fix a couple of leaked hdr objects
 - Avoid reading the semadj buffer with rcu_read_lock() held
 - Set the sem_undo pointer on tasks other than the first to restore a list
 - Fix refcounting on restart
 - Pull out the guts of exit_sem() into put_undo_list() and call that
   from our drop() function in case we're the last one.

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/checkpoint.h     |    4 +
 include/linux/checkpoint_hdr.h |   18 ++
 ipc/sem.c                      |  342 +++++++++++++++++++++++++++++++++++++++-
 kernel/checkpoint/process.c    |   13 ++
 4 files changed, 369 insertions(+), 8 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 4e25042..34e81f4 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -271,6 +271,10 @@ extern int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref);
 
+/* per-task semaphore undo */
+int checkpoint_obj_sem_undo(struct ckpt_ctx *ctx, struct task_struct *t);
+int restore_obj_sem_undo(struct ckpt_ctx *ctx, int sem_undo_objref);
+
 /* memory */
 extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f4f9577..b48fa37 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -167,6 +167,10 @@ enum {
 #define CKPT_HDR_IPC_MSG_MSG CKPT_HDR_IPC_MSG_MSG
 	CKPT_HDR_IPC_SEM,
 #define CKPT_HDR_IPC_SEM CKPT_HDR_IPC_SEM
+	CKPT_HDR_TASK_SEM_UNDO_LIST,
+#define CKPT_HDR_TASK_SEM_UNDO_LIST CKPT_HDR_TASK_SEM_UNDO_LIST
+	CKPT_HDR_TASK_SEM_UNDO,
+#define CKPT_HDR_TASK_SEM_UNDO CKPT_HDR_TASK_SEM_UNDO
 
 	CKPT_HDR_SIGHAND = 601,
 #define CKPT_HDR_SIGHAND CKPT_HDR_SIGHAND
@@ -273,6 +277,8 @@ enum obj_type {
 #define CKPT_OBJ_NET_NS CKPT_OBJ_NET_NS
 	CKPT_OBJ_NETDEV,
 #define CKPT_OBJ_NETDEV CKPT_OBJ_NETDEV
+	CKPT_OBJ_SEM_UNDO,
+#define CKPT_OBJ_SEM_UNDO CKPT_OBJ_SEM_UNDO
 	CKPT_OBJ_MAX
 #define CKPT_OBJ_MAX CKPT_OBJ_MAX
 };
@@ -461,6 +467,17 @@ struct ckpt_hdr_ns {
 	__s32 net_objref;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_task_sem_undo_list {
+	struct ckpt_hdr h;
+	__u32 count;
+};
+
+struct ckpt_hdr_task_sem_undo {
+	struct ckpt_hdr h;
+	__u32 semid;
+	__u32 semadj_count;
+};
+
 /* cannot include <linux/tty.h> from userspace, so define: */
 #define CKPT_NEW_UTS_LEN  64
 #ifdef __KERNEL__
@@ -487,6 +504,7 @@ struct ckpt_hdr_task_objs {
 	__s32 files_objref;
 	__s32 mm_objref;
 	__s32 fs_objref;
+	__s32 sem_undo_objref;
 	__s32 sighand_objref;
 	__s32 signal_objref;
 } __attribute__((aligned(8)));
diff --git a/ipc/sem.c b/ipc/sem.c
index e439b73..3f74a53 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -132,12 +132,56 @@ void sem_exit_ns(struct ipc_namespace *ns)
 }
 #endif
 
+static int obj_sem_undo_grab(void *ptr)
+{
+	struct sem_undo_list *ulp = ptr;
+
+	atomic_inc(&ulp->refcnt);
+	return 0;
+}
+
+static void put_undo_list(struct sem_undo_list *ulp);
+
+static void obj_sem_undo_drop(void *ptr, int lastref)
+{
+	struct sem_undo_list *ulp = ptr;
+
+	put_undo_list(ulp);
+}
+
+static int obj_sem_undo_users(void *ptr)
+{
+	struct sem_undo_list *ulp = ptr;
+
+	return atomic_read(&ulp->refcnt);
+}
+
+static void *restore_sem_undo(struct ckpt_ctx *ctx);
+static int checkpoint_sem_undo(struct ckpt_ctx *ctx, void *ptr);
+
+static const struct ckpt_obj_ops ckpt_obj_sem_undo_ops = {
+	.obj_name = "IPC_SEM_UNDO",
+	.obj_type = CKPT_OBJ_SEM_UNDO,
+	.ref_drop = obj_sem_undo_drop,
+	.ref_grab = obj_sem_undo_grab,
+	.ref_users = obj_sem_undo_users,
+	.checkpoint = checkpoint_sem_undo,
+	.restore = restore_sem_undo,
+};
+
 void __init sem_init (void)
 {
 	sem_init_ns(&init_ipc_ns);
 	ipc_init_proc_interface("sysvipc/sem",
 				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
 				IPC_SEM_IDS, sysvipc_sem_proc_show);
+
+	/* sem_undo_list          uses a short
+	*  ckpt_hdr_task_sem_undo uses a __u16
+	*/
+	BUILD_BUG_ON(sizeof(short) != sizeof(__u16));
+
+	register_checkpoint_obj(&ckpt_obj_sem_undo_ops);
 }
 
 /*
@@ -1363,14 +1407,8 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
  * The current implementation does not do so. The POSIX standard
  * and SVID should be consulted to determine what behavior is mandated.
  */
-void exit_sem(struct task_struct *tsk)
+static void put_undo_list(struct sem_undo_list *ulp)
 {
-	struct sem_undo_list *ulp;
-
-	ulp = tsk->sysvsem.undo_list;
-	if (!ulp)
-		return;
-	tsk->sysvsem.undo_list = NULL;
 
 	if (!atomic_dec_and_test(&ulp->refcnt))
 		return;
@@ -1393,7 +1431,7 @@ void exit_sem(struct task_struct *tsk)
 		if (semid == -1)
 			break;
 
-		sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid);
+		sma = sem_lock_check(ulp->ipc_ns, un->semid);
 
 		/* exit_sem raced with IPC_RMID, nothing to do */
 		if (IS_ERR(sma))
@@ -1451,6 +1489,16 @@ void exit_sem(struct task_struct *tsk)
 	kfree(ulp);
 }
 
+void exit_sem(struct task_struct *tsk)
+{
+	struct sem_undo_list *ulp = tsk->sysvsem.undo_list;
+
+	if (ulp) {
+		put_undo_list(ulp);
+		tsk->sysvsem.undo_list = NULL;
+	}
+}
+
 #ifdef CONFIG_PROC_FS
 static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 {
@@ -1470,3 +1518,281 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 			  sma->sem_ctime);
 }
 #endif
+
+static int __get_task_semids(struct sem_undo_list *ulp, int *semids, int max)
+{
+	int count = 0;
+	struct sem_undo *un;
+
+	if (!ulp)
+		return 0;
+
+	spin_lock(&ulp->lock);
+	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
+		if (count >= max) {
+			count = -E2BIG;
+			break;
+		}
+		semids[count++] = un->semid;
+	}
+	spin_unlock(&ulp->lock);
+
+	return count;
+}
+
+static int get_task_semids(struct sem_undo_list *ulp, int **semid_listp)
+{
+	int ret;
+	int max = 32;
+	int *semid_list = NULL;
+ retry:
+	*semid_listp = krealloc(semid_list, max * sizeof(int), GFP_KERNEL);
+	if (!*semid_listp) {
+		kfree(semid_list);
+		return -ENOMEM;
+	}
+	semid_list = *semid_listp;
+
+	ret = __get_task_semids(ulp, semid_list, max);
+	if (ret == -E2BIG) {
+		max *= 2;
+		goto retry;
+	} else if (ret < 0) {
+		kfree(semid_list);
+		*semid_listp = NULL;
+	}
+
+	return ret;
+}
+
+int checkpoint_sem_undo_adj(struct ckpt_ctx *ctx, struct sem_undo *un)
+{
+	int nsems;
+	int ret;
+	short *semadj = NULL;
+	struct sem_array *sma;
+	struct ckpt_hdr_task_sem_undo *h = NULL;
+
+	sma = sem_lock(ctx->root_nsproxy->ipc_ns, un->semid);
+	if (IS_ERR(sma)) {
+		ckpt_debug("unable to lock semid %i (wrong ns?)\n", un->semid);
+		return PTR_ERR(sma);
+	}
+
+	nsems = sma->sem_nsems;
+	sem_getref_and_unlock(sma);
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO);
+	if (!h)
+		goto putref_abort;
+
+	semadj = kzalloc(nsems * sizeof(short), GFP_KERNEL);
+	if (!semadj)
+		goto putref_abort;
+
+	sem_lock_and_putref(sma);
+
+	h->semid = un->semid;
+	h->semadj_count = nsems;
+	memcpy(semadj, un->semadj, h->semadj_count * sizeof(__u16));
+
+	sem_unlock(sma);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *)h);
+	if (ret == 0)
+		ret = ckpt_write_buffer(ctx, semadj, nsems * sizeof(short));
+
+	kfree(semadj);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+
+ putref_abort:
+	sem_putref(sma);
+	if (h)
+		ckpt_hdr_put(ctx, h);
+
+	return -ENOMEM;
+}
+
+int write_sem_undo_list(struct ckpt_ctx *ctx, struct sem_undo_list *ulp,
+			int count, int *semids)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < count; i++) {
+		struct sem_undo *un;
+
+		spin_lock(&ulp->lock);
+		un = lookup_undo(ulp, semids[i]);
+		spin_unlock(&ulp->lock);
+
+		if (!un) {
+			ckpt_debug("unable to lookup semid %i\n", semids[i]);
+			return -EINVAL;
+		}
+
+		ret = checkpoint_sem_undo_adj(ctx, un);
+		ckpt_debug("checkpoint_sem_undo: %i\n", ret);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int checkpoint_sem_undo(struct ckpt_ctx *ctx, void *ptr)
+{
+	int ret;
+	int *semids = NULL;
+	struct sem_undo_list *ulp = ptr;
+	struct ckpt_hdr_task_sem_undo_list *h;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO_LIST);
+	if (!h)
+		return -ENOMEM;
+
+	ret = get_task_semids(ulp, &semids);
+	if (ret < 0)
+		goto out;
+	h->count = ret;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *)h);
+	if (ret < 0)
+		goto out;
+
+	ret = write_sem_undo_list(ctx, ulp, h->count, semids);
+ out:
+	ckpt_hdr_put(ctx, h);
+	kfree(semids);
+
+	return ret;
+}
+
+int checkpoint_obj_sem_undo(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct sem_undo_list *ulp;
+
+	ulp = t->sysvsem.undo_list;
+	if (ulp)
+		return checkpoint_obj(ctx, ulp, CKPT_OBJ_SEM_UNDO);
+
+	return 0;
+}
+
+/* Count the number of sems for the given sem_undo->semid */
+static int sem_undo_nsems(struct sem_undo *un, struct ipc_namespace *ns)
+{
+	struct sem_array *sma;
+	int nsems;
+
+	sma = sem_lock(ns, un->semid);
+	if (IS_ERR(sma))
+		return PTR_ERR(sma);
+
+	nsems = sma->sem_nsems;
+	sem_unlock(sma);
+
+	return nsems;
+}
+
+static int restore_task_sem_undo_adj(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_sem_undo *h;
+	int len;
+	int ret = -ENOMEM;
+	struct sem_undo *un;
+	int nsems;
+	short *semadj = NULL;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	len = sizeof(__u16) * h->semadj_count;
+	semadj = kzalloc(len, GFP_KERNEL);
+	if (!semadj)
+		goto out;
+
+	ret = _ckpt_read_buffer(ctx, semadj, len);
+	if (ret < 0)
+		goto out;
+
+	un = find_alloc_undo(current->nsproxy->ipc_ns, h->semid);
+	if (IS_ERR(un)) {
+		ret = PTR_ERR(un);
+		ckpt_debug("unable to find semid %i\n", h->semid);
+		goto out;
+	}
+
+	nsems = sem_undo_nsems(un, current->nsproxy->ipc_ns);
+	len = sizeof(short) * nsems;
+	memcpy(un->semadj, semadj, len);
+	rcu_read_unlock();
+
+	if (nsems != h->semadj_count)
+		ckpt_err(ctx, -EINVAL,
+			 "semid %i has nmsems=%i but %i undo adjustments\n",
+			 h->semid, nsems, h->semadj_count);
+	else
+		ckpt_debug("semid %i restored with %i adjustments\n",
+			   h->semid, h->semadj_count);
+ out:
+	ckpt_hdr_put(ctx, h);
+	kfree(semadj);
+
+	return ret;
+}
+
+static void *restore_sem_undo(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_sem_undo_list *h;
+	struct sem_undo_list *ulp = NULL;
+	int i;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO_LIST);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+
+	ulp = alloc_undo_list(current->nsproxy->ipc_ns);
+	if (!ulp) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < h->count; i++) {
+		ret = restore_task_sem_undo_adj(ctx);
+		if (ret < 0)
+			goto out;
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	else
+		return ulp;
+}
+
+int restore_obj_sem_undo(struct ckpt_ctx *ctx, int sem_undo_objref)
+{
+	struct sem_undo_list *ulp;
+
+	if (!sem_undo_objref)
+		return 0; /* Task had no undo list */
+
+	ulp = ckpt_obj_try_fetch(ctx, sem_undo_objref, CKPT_OBJ_SEM_UNDO);
+	if (IS_ERR(ulp))
+		return PTR_ERR(ulp);
+
+	/* The first task to restore a shared list should already have this,
+	 * but subsequent ones won't, so attach to current in that case
+	 */
+	if (!current->sysvsem.undo_list)
+		current->sysvsem.undo_list = ulp;
+
+	atomic_inc(&ulp->refcnt);
+
+	return 0;
+}
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
index 936675a..4ec9cdd 100644
--- a/kernel/checkpoint/process.c
+++ b/kernel/checkpoint/process.c
@@ -236,6 +236,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 	int files_objref;
 	int mm_objref;
 	int fs_objref;
+	int sem_undo_objref;
 	int sighand_objref;
 	int signal_objref;
 	int first, ret;
@@ -283,6 +284,12 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 		return fs_objref;
 	}
 
+	sem_undo_objref = checkpoint_obj_sem_undo(ctx, t);
+	if (sem_undo_objref < 0) {
+		ckpt_err(ctx, sem_undo_objref, "%(T)process sem_undo\n");
+		return sem_undo_objref;
+	}
+
 	sighand_objref = checkpoint_obj_sighand(ctx, t);
 	ckpt_debug("sighand: objref %d\n", sighand_objref);
 	if (sighand_objref < 0) {
@@ -311,6 +318,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 	h->files_objref = files_objref;
 	h->mm_objref = mm_objref;
 	h->fs_objref = fs_objref;
+	h->sem_undo_objref = sem_undo_objref;
 	h->sighand_objref = sighand_objref;
 	h->signal_objref = signal_objref;
 	ret = ckpt_write_obj(ctx, &h->h);
@@ -679,6 +687,11 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
 	if (ret < 0)
 		return ret;
 
+	ret = restore_obj_sem_undo(ctx, h->sem_undo_objref);
+	ckpt_debug("sem_undo: ret %d\n", ret);
+	if (ret < 0)
+		return ret;
+
 	ret = restore_obj_sighand(ctx, h->sighand_objref);
 	ckpt_debug("sighand: ret %d (%p)\n", ret, current->sighand);
 	if (ret < 0)
-- 
1.7.1.1

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] Add support for the per-task sem_undo list (v2)
       [not found]     ` <1280941345-27566-3-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-08-04 18:49       ` Oren Laadan
       [not found]         ` <4C59B625.7000007-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Oren Laadan @ 2010-08-04 18:49 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



On 08/04/2010 01:02 PM, Dan Smith wrote:
> The semaphore undo list is a set of adjustments to be made to semaphores
> held by a task on exit.  Right now, we do not checkpoint or restore this
> list which could cause undesirable behavior by a restarted process on exit.
>
> Changes in v2:
>   - Remove collect operation
>   - Add a BUILD_BUG_ON() to ensure sizeof(short) == sizeof(__u16)
>   - Use sizeof(__u16) when copying to/from checkpoint header
>   - Fix a couple of leaked hdr objects
>   - Avoid reading the semadj buffer with rcu_read_lock() held
>   - Set the sem_undo pointer on tasks other than the first to restore a list
>   - Fix refcounting on restart
>   - Pull out the guts of exit_sem() into put_undo_list() and call that
>     from our drop() function in case we're the last one.
>
> Signed-off-by: Dan Smith<danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Looks better. Still some more comments...

> ---
>   include/linux/checkpoint.h     |    4 +
>   include/linux/checkpoint_hdr.h |   18 ++
>   ipc/sem.c                      |  342 +++++++++++++++++++++++++++++++++++++++-
>   kernel/checkpoint/process.c    |   13 ++
>   4 files changed, 369 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 4e25042..34e81f4 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -271,6 +271,10 @@ extern int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t);
>   extern int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t);
>   extern int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref);
>
> +/* per-task semaphore undo */
> +int checkpoint_obj_sem_undo(struct ckpt_ctx *ctx, struct task_struct *t);
> +int restore_obj_sem_undo(struct ckpt_ctx *ctx, int sem_undo_objref);

Please add "extern" here (been burned before)

[...]

> +static int obj_sem_undo_grab(void *ptr)
> +{
> +	struct sem_undo_list *ulp = ptr;
> +
> +	atomic_inc(&ulp->refcnt);
> +	return 0;
> +}
> +
> +static void put_undo_list(struct sem_undo_list *ulp);

nit: to me it makes more sense to move grab/drop/checkpoint/restart
code, as well sem_init(), to the end of the file (and avoid those
statics...).

> +static void obj_sem_undo_drop(void *ptr, int lastref)
> +{
> +	struct sem_undo_list *ulp = ptr;
> +
> +	put_undo_list(ulp);
> +}
> +
> +static int obj_sem_undo_users(void *ptr)
> +{
> +	struct sem_undo_list *ulp = ptr;
> +
> +	return atomic_read(&ulp->refcnt);
> +}

Can get rid of ..._users() since we don't collect them.

> +
> +static void *restore_sem_undo(struct ckpt_ctx *ctx);
> +static int checkpoint_sem_undo(struct ckpt_ctx *ctx, void *ptr);
> +
> +static const struct ckpt_obj_ops ckpt_obj_sem_undo_ops = {
> +	.obj_name = "IPC_SEM_UNDO",
> +	.obj_type = CKPT_OBJ_SEM_UNDO,
> +	.ref_drop = obj_sem_undo_drop,
> +	.ref_grab = obj_sem_undo_grab,
> +	.ref_users = obj_sem_undo_users,

(here too)

> +	.checkpoint = checkpoint_sem_undo,
> +	.restore = restore_sem_undo,
> +};
> +
>   void __init sem_init (void)
>   {
>   	sem_init_ns(&init_ipc_ns);
>   	ipc_init_proc_interface("sysvipc/sem",
>   				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
>   				IPC_SEM_IDS, sysvipc_sem_proc_show);
> +
> +	/* sem_undo_list          uses a short
> +	*  ckpt_hdr_task_sem_undo uses a __u16
> +	*/
> +	BUILD_BUG_ON(sizeof(short) != sizeof(__u16));

semadj is short, so:
s/__u16/__s16/

> +
> +	register_checkpoint_obj(&ckpt_obj_sem_undo_ops);
>   }
>
>   /*
> @@ -1363,14 +1407,8 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
>    * The current implementation does not do so. The POSIX standard
>    * and SVID should be consulted to determine what behavior is mandated.
>    */
> -void exit_sem(struct task_struct *tsk)
> +static void put_undo_list(struct sem_undo_list *ulp)
>   {
> -	struct sem_undo_list *ulp;
> -
> -	ulp = tsk->sysvsem.undo_list;
> -	if (!ulp)
> -		return;
> -	tsk->sysvsem.undo_list = NULL;
>
>   	if (!atomic_dec_and_test(&ulp->refcnt))
>   		return;
> @@ -1393,7 +1431,7 @@ void exit_sem(struct task_struct *tsk)
>   		if (semid == -1)
>   			break;
>
> -		sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid);
> +		sma = sem_lock_check(ulp->ipc_ns, un->semid);

This hunk belongs to previous patch, no ?

[...]

> +int checkpoint_sem_undo_adj(struct ckpt_ctx *ctx, struct sem_undo *un)
> +{
> +	int nsems;
> +	int ret;
> +	short *semadj = NULL;
> +	struct sem_array *sma;
> +	struct ckpt_hdr_task_sem_undo *h = NULL;
> +
> +	sma = sem_lock(ctx->root_nsproxy->ipc_ns, un->semid);
> +	if (IS_ERR(sma)) {
> +		ckpt_debug("unable to lock semid %i (wrong ns?)\n", un->semid);
> +		return PTR_ERR(sma);
> +	}
> +
> +	nsems = sma->sem_nsems;
> +	sem_getref_and_unlock(sma);
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO);
> +	if (!h)
> +		goto putref_abort;
> +
> +	semadj = kzalloc(nsems * sizeof(short), GFP_KERNEL);
> +	if (!semadj)
> +		goto putref_abort;
> +
> +	sem_lock_and_putref(sma);
> +
> +	h->semid = un->semid;
> +	h->semadj_count = nsems;
> +	memcpy(semadj, un->semadj, h->semadj_count * sizeof(__u16));

s/u__u16/__s16/

> +
> +	sem_unlock(sma);
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *)h);
> +	if (ret == 0)
> +		ret = ckpt_write_buffer(ctx, semadj, nsems * sizeof(short));

.... s/short/__s16/

[...]

> +static int restore_task_sem_undo_adj(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_task_sem_undo *h;
> +	int len;
> +	int ret = -ENOMEM;
> +	struct sem_undo *un;
> +	int nsems;
> +	short *semadj = NULL;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	len = sizeof(__u16) * h->semadj_count;

s/__u16/__s16/

> +	semadj = kzalloc(len, GFP_KERNEL);
> +	if (!semadj)
> +		goto out;
> +
> +	ret = _ckpt_read_buffer(ctx, semadj, len);
> +	if (ret<  0)
> +		goto out;

Use ckpt_read_payload() - it already allocates the payload buffer.

> +
> +	un = find_alloc_undo(current->nsproxy->ipc_ns, h->semid);
> +	if (IS_ERR(un)) {
> +		ret = PTR_ERR(un);
> +		ckpt_debug("unable to find semid %i\n", h->semid);
> +		goto out;
> +	}
> +
> +	nsems = sem_undo_nsems(un, current->nsproxy->ipc_ns);
> +	len = sizeof(short) * nsems;

s/short/__s16/

If nsems > h->semadj_count, we may be accessing bad memory in
the semadj buffer from above, therefore ...

> +	memcpy(un->semadj, semadj, len);
> +	rcu_read_unlock();
> +
> +	if (nsems != h->semadj_count)

... this test should occur before the memcpy() above.

> +		ckpt_err(ctx, -EINVAL,
> +			 "semid %i has nmsems=%i but %i undo adjustments\n",
> +			 h->semid, nsems, h->semadj_count);
> +	else
> +		ckpt_debug("semid %i restored with %i adjustments\n",
> +			   h->semid, h->semadj_count);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	kfree(semadj);
> +
> +	return ret;
> +}
> +
> +static void *restore_sem_undo(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_task_sem_undo_list *h;
> +	struct sem_undo_list *ulp = NULL;
> +	int i;
> +	int ret = 0;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO_LIST);
> +	if (IS_ERR(h))
> +		return ERR_PTR(PTR_ERR(h));
> +
> +	ulp = alloc_undo_list(current->nsproxy->ipc_ns);
> +	if (!ulp) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	for (i = 0; i<  h->count; i++) {
> +		ret = restore_task_sem_undo_adj(ctx);
> +		if (ret<  0)
> +			goto out;
> +	}
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	if (ret<  0)
> +		return ERR_PTR(ret);
> +	else
> +		return ulp;
> +}
> +
> +int restore_obj_sem_undo(struct ckpt_ctx *ctx, int sem_undo_objref)
> +{
> +	struct sem_undo_list *ulp;
> +
> +	if (!sem_undo_objref)
> +		return 0; /* Task had no undo list */
> +
> +	ulp = ckpt_obj_try_fetch(ctx, sem_undo_objref, CKPT_OBJ_SEM_UNDO);
> +	if (IS_ERR(ulp))
> +		return PTR_ERR(ulp);
> +
> +	/* The first task to restore a shared list should already have this,
> +	 * but subsequent ones won't, so attach to current in that case
> +	 */
> +	if (!current->sysvsem.undo_list)
> +		current->sysvsem.undo_list = ulp;
> +
> +	atomic_inc(&ulp->refcnt);

I suspect there is a leak here: atomic_inc is needed only for
tasks other than the first task; The first task already gets the
refcount deep inside find_alloc_undo(), and the hash-table gets
its ref via the obj->grab method.

[...]

Oren.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] Add support for the per-task sem_undo list (v2)
       [not found]         ` <4C59B625.7000007-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2010-08-04 20:17           ` Dan Smith
       [not found]             ` <87pqxyku9j.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Dan Smith @ 2010-08-04 20:17 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

>> +static void put_undo_list(struct sem_undo_list *ulp);

OL> nit: to me it makes more sense to move
OL> grab/drop/checkpoint/restart code, as well sem_init(), to the end
OL> of the file (and avoid those statics...).

Okay, was trying to avoid chopping the file up too much.

OL> Can get rid of ..._users() since we don't collect them.

Oops :)

OL> semadj is short, so:
OL> s/__u16/__s16/

Gah, yeah, I always want everything to be unsigned for some reason :D

>> -		sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid);
>> +		sma = sem_lock_check(ulp->ipc_ns, un->semid);

OL> This hunk belongs to previous patch, no ?

It could, but it seems more relevant when converting the function from
using a task to using just the undo_list itself...

>> +	atomic_inc(&ulp->refcnt);

OL> I suspect there is a leak here: atomic_inc is needed only for
OL> tasks other than the first task; The first task already gets the
OL> refcount deep inside find_alloc_undo(), and the hash-table gets
OL> its ref via the obj->grab method.

But then it drops that ref right after it inserts it, thus we need
another one, right?  I think the confusion may come from the fact
that the hash assumes the first refcount is its own and the caller of
restore_obj() will grab another (for the task, in this case).  Since
we don't do that, we need to grab both early on.  I hit this in
another place in the network stuff, IIRC, which generated a similar
review comment ;)

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] Add support for the per-task sem_undo list (v2)
       [not found]             ` <87pqxyku9j.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2010-08-04 21:29               ` Oren Laadan
       [not found]                 ` <4C59DBD0.1000107-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Oren Laadan @ 2010-08-04 21:29 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



On 08/04/2010 04:17 PM, Dan Smith wrote:
>>> +static void put_undo_list(struct sem_undo_list *ulp);
>
> OL>  nit: to me it makes more sense to move
> OL>  grab/drop/checkpoint/restart code, as well sem_init(), to the end
> OL>  of the file (and avoid those statics...).
>
> Okay, was trying to avoid chopping the file up too much.
>
> OL>  Can get rid of ..._users() since we don't collect them.
>
> Oops :)
>
> OL>  semadj is short, so:
> OL>  s/__u16/__s16/
>
> Gah, yeah, I always want everything to be unsigned for some reason :D
>
>>> -		sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid);
>>> +		sma = sem_lock_check(ulp->ipc_ns, un->semid);
>
> OL>  This hunk belongs to previous patch, no ?
>
> It could, but it seems more relevant when converting the function from
> using a task to using just the undo_list itself...
>
>>> +	atomic_inc(&ulp->refcnt);
>
> OL>  I suspect there is a leak here: atomic_inc is needed only for
> OL>  tasks other than the first task; The first task already gets the
> OL>  refcount deep inside find_alloc_undo(), and the hash-table gets
> OL>  its ref via the obj->grab method.
>
> But then it drops that ref right after it inserts it, thus we need
> another one, right?.  I think the confusion may come from the fact
> that the hash assumes the first refcount is its own and the caller of
> restore_obj() will grab another (for the task, in this case).  Since
> we don't do that, we need to grab both early on.  I hit this in
> another place in the network stuff, IIRC, which generated a similar
> review comment ;)

True ... but -

In that case, the atomic_inc for the first task should occur
earlier - as soon as it is attached to the task _and_ inserted
into the objhash.

Consider the following scenario:  the task calls find_alloc_undo(),
so the undo_list is attached to the task; then when the restore
succeeds, the undo_list is also in the obj_hash. But one reference
is missing. A malicious user can make restart fail now - e.g. by
corrupting the image file - before the first tasks grabbed the
additional reference :(

So my point is valid if not accurate - the atomic_inc() does not
belong there.

---

And this made me think... I wonder if the following is a security
hazard for us:

In the current code, e.g. restore_file_table(), the first task
that restores a given file-table (or mm) will assume that it has
a "fresh" file-table (or mm) and will modify it on-the-spot.

This works because userspace restart will arrange for restarting
tasks to be like that (only threads will a-priori share mm). If
not, then a later task could "overwrite" the restore work of a
previous task in case they had shared e.g. file-table when the
restart starts.

That in itself is not a concern; but support a malicious user
modifies userspace restart, to have multiple tasks share their
file-table and/or mm. That user could arrange for the image to
force a certain state (file-table, mm, ..) on a previous task
when restoring a later task.

Again, in itself it is not a concern; but what if that previous
task was restored to have more privileges/credentials ?  would
that allow a malicious user to break out ?

Oren.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] Add support for the per-task sem_undo list (v2)
       [not found]                 ` <4C59DBD0.1000107-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2010-08-04 21:38                   ` Dan Smith
       [not found]                     ` <87lj8mkqhk.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Dan Smith @ 2010-08-04 21:38 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

OL> In that case, the atomic_inc for the first task should occur
OL> earlier - as soon as it is attached to the task _and_ inserted
OL> into the objhash.

Okay, fair enough.  By my calculations, it belongs just a few lines
above, in restore_sem_undo() before a successful termination.  Then we
leave the refcount increment in restore_obj_sem_undo(), but only for
second-and-later tasks.  Agreed?

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] Add support for the per-task sem_undo list (v2)
       [not found]                     ` <87lj8mkqhk.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2010-08-04 21:48                       ` Oren Laadan
  2010-08-05 16:17                         ` Dan Smith
  0 siblings, 1 reply; 9+ messages in thread
From: Oren Laadan @ 2010-08-04 21:48 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



On 08/04/2010 05:38 PM, Dan Smith wrote:
> OL>  In that case, the atomic_inc for the first task should occur
> OL>  earlier - as soon as it is attached to the task _and_ inserted
> OL>  into the objhash.
>
> Okay, fair enough.  By my calculations, it belongs just a few lines
> above, in restore_sem_undo() before a successful termination.  Then we

Yep, in the last "else" clause that returns the @ulp.

> leave the refcount increment in restore_obj_sem_undo(), but only for
> second-and-later tasks.  Agreed?

Deal :)

Oren.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] Add support for the per-task sem_undo list (v2)
  2010-08-04 21:48                       ` Oren Laadan
@ 2010-08-05 16:17                         ` Dan Smith
  0 siblings, 0 replies; 9+ messages in thread
From: Dan Smith @ 2010-08-05 16:17 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

OL> Deal :)

Okay, included below.  I put the refcount for the first task before
the "out:" label to avoid the curly braces on the else clause, which
should be functionally the same.  Hope that's okay :)

Thanks!

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

Add support for the per-task sem_undo list (v3)

The semaphore undo list is a set of adjustments to be made to semaphores
held by a task on exit.  Right now, we do not checkpoint or restore this
list which could cause undesirable behavior by a restarted process on exit.

Changes in v3:
 - Move taking of the refcount for the first process to restore_sem_undo()
   and make restore_obj_sem_undo() take the reference for second-and-
   later tasks
 - Fix uses of __u16 to represent a short
 - Fix potential for overrunning the un->semadj buffer in restore
 - Move the checkpoint object and init functions to the bottom of sem.c

Changes in v2:
 - Remove collect operation
 - Add a BUILD_BUG_ON() to ensure sizeof(short) == sizeof(__u16)
 - Use sizeof(__u16) when copying to/from checkpoint header
 - Fix a couple of leaked hdr objects
 - Avoid reading the semadj buffer with rcu_read_lock() held
 - Set the sem_undo pointer on tasks other than the first to restore a list
 - Fix refcounting on restart
 - Pull out the guts of exit_sem() into put_undo_list() and call that
   from our drop() function in case we're the last one.

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 4e25042..a11d40e 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -271,6 +271,10 @@ extern int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref);
 
+/* per-task semaphore undo */
+extern int checkpoint_obj_sem_undo(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_obj_sem_undo(struct ckpt_ctx *ctx, int sem_undo_objref);
+
 /* memory */
 extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f4f9577..b48fa37 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -167,6 +167,10 @@ enum {
 #define CKPT_HDR_IPC_MSG_MSG CKPT_HDR_IPC_MSG_MSG
 	CKPT_HDR_IPC_SEM,
 #define CKPT_HDR_IPC_SEM CKPT_HDR_IPC_SEM
+	CKPT_HDR_TASK_SEM_UNDO_LIST,
+#define CKPT_HDR_TASK_SEM_UNDO_LIST CKPT_HDR_TASK_SEM_UNDO_LIST
+	CKPT_HDR_TASK_SEM_UNDO,
+#define CKPT_HDR_TASK_SEM_UNDO CKPT_HDR_TASK_SEM_UNDO
 
 	CKPT_HDR_SIGHAND = 601,
 #define CKPT_HDR_SIGHAND CKPT_HDR_SIGHAND
@@ -273,6 +277,8 @@ enum obj_type {
 #define CKPT_OBJ_NET_NS CKPT_OBJ_NET_NS
 	CKPT_OBJ_NETDEV,
 #define CKPT_OBJ_NETDEV CKPT_OBJ_NETDEV
+	CKPT_OBJ_SEM_UNDO,
+#define CKPT_OBJ_SEM_UNDO CKPT_OBJ_SEM_UNDO
 	CKPT_OBJ_MAX
 #define CKPT_OBJ_MAX CKPT_OBJ_MAX
 };
@@ -461,6 +467,17 @@ struct ckpt_hdr_ns {
 	__s32 net_objref;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_task_sem_undo_list {
+	struct ckpt_hdr h;
+	__u32 count;
+};
+
+struct ckpt_hdr_task_sem_undo {
+	struct ckpt_hdr h;
+	__u32 semid;
+	__u32 semadj_count;
+};
+
 /* cannot include <linux/tty.h> from userspace, so define: */
 #define CKPT_NEW_UTS_LEN  64
 #ifdef __KERNEL__
@@ -487,6 +504,7 @@ struct ckpt_hdr_task_objs {
 	__s32 files_objref;
 	__s32 mm_objref;
 	__s32 fs_objref;
+	__s32 sem_undo_objref;
 	__s32 sighand_objref;
 	__s32 signal_objref;
 } __attribute__((aligned(8)));
diff --git a/ipc/sem.c b/ipc/sem.c
index e439b73..d79b1ee 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -132,14 +132,6 @@ void sem_exit_ns(struct ipc_namespace *ns)
 }
 #endif
 
-void __init sem_init (void)
-{
-	sem_init_ns(&init_ipc_ns);
-	ipc_init_proc_interface("sysvipc/sem",
-				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
-				IPC_SEM_IDS, sysvipc_sem_proc_show);
-}
-
 /*
  * sem_lock_(check_) routines are called in the paths where the rw_mutex
  * is not held.
@@ -1363,14 +1355,8 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
  * The current implementation does not do so. The POSIX standard
  * and SVID should be consulted to determine what behavior is mandated.
  */
-void exit_sem(struct task_struct *tsk)
+static void put_undo_list(struct sem_undo_list *ulp)
 {
-	struct sem_undo_list *ulp;
-
-	ulp = tsk->sysvsem.undo_list;
-	if (!ulp)
-		return;
-	tsk->sysvsem.undo_list = NULL;
 
 	if (!atomic_dec_and_test(&ulp->refcnt))
 		return;
@@ -1393,7 +1379,7 @@ void exit_sem(struct task_struct *tsk)
 		if (semid == -1)
 			break;
 
-		sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid);
+		sma = sem_lock_check(ulp->ipc_ns, un->semid);
 
 		/* exit_sem raced with IPC_RMID, nothing to do */
 		if (IS_ERR(sma))
@@ -1451,6 +1437,16 @@ void exit_sem(struct task_struct *tsk)
 	kfree(ulp);
 }
 
+void exit_sem(struct task_struct *tsk)
+{
+	struct sem_undo_list *ulp = tsk->sysvsem.undo_list;
+
+	if (ulp) {
+		put_undo_list(ulp);
+		tsk->sysvsem.undo_list = NULL;
+	}
+}
+
 #ifdef CONFIG_PROC_FS
 static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 {
@@ -1470,3 +1466,324 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 			  sma->sem_ctime);
 }
 #endif
+
+static int __get_task_semids(struct sem_undo_list *ulp, int *semids, int max)
+{
+	int count = 0;
+	struct sem_undo *un;
+
+	if (!ulp)
+		return 0;
+
+	spin_lock(&ulp->lock);
+	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
+		if (count >= max) {
+			count = -E2BIG;
+			break;
+		}
+		semids[count++] = un->semid;
+	}
+	spin_unlock(&ulp->lock);
+
+	return count;
+}
+
+static int get_task_semids(struct sem_undo_list *ulp, int **semid_listp)
+{
+	int ret;
+	int max = 32;
+	int *semid_list = NULL;
+ retry:
+	*semid_listp = krealloc(semid_list, max * sizeof(int), GFP_KERNEL);
+	if (!*semid_listp) {
+		kfree(semid_list);
+		return -ENOMEM;
+	}
+	semid_list = *semid_listp;
+
+	ret = __get_task_semids(ulp, semid_list, max);
+	if (ret == -E2BIG) {
+		max *= 2;
+		goto retry;
+	} else if (ret < 0) {
+		kfree(semid_list);
+		*semid_listp = NULL;
+	}
+
+	return ret;
+}
+
+int checkpoint_sem_undo_adj(struct ckpt_ctx *ctx, struct sem_undo *un)
+{
+	int nsems;
+	int ret;
+	short *semadj = NULL;
+	struct sem_array *sma;
+	struct ckpt_hdr_task_sem_undo *h = NULL;
+
+	sma = sem_lock(ctx->root_nsproxy->ipc_ns, un->semid);
+	if (IS_ERR(sma)) {
+		ckpt_debug("unable to lock semid %i (wrong ns?)\n", un->semid);
+		return PTR_ERR(sma);
+	}
+
+	nsems = sma->sem_nsems;
+	sem_getref_and_unlock(sma);
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO);
+	if (!h)
+		goto putref_abort;
+
+	semadj = kzalloc(nsems * sizeof(short), GFP_KERNEL);
+	if (!semadj)
+		goto putref_abort;
+
+	sem_lock_and_putref(sma);
+
+	h->semid = un->semid;
+	h->semadj_count = nsems;
+	memcpy(semadj, un->semadj, h->semadj_count * sizeof(__s16));
+
+	sem_unlock(sma);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *)h);
+	if (ret == 0)
+		ret = ckpt_write_buffer(ctx, semadj, nsems * sizeof(__s16));
+
+	kfree(semadj);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+
+ putref_abort:
+	sem_putref(sma);
+	if (h)
+		ckpt_hdr_put(ctx, h);
+
+	return -ENOMEM;
+}
+
+int write_sem_undo_list(struct ckpt_ctx *ctx, struct sem_undo_list *ulp,
+			int count, int *semids)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < count; i++) {
+		struct sem_undo *un;
+
+		spin_lock(&ulp->lock);
+		un = lookup_undo(ulp, semids[i]);
+		spin_unlock(&ulp->lock);
+
+		if (!un) {
+			ckpt_debug("unable to lookup semid %i\n", semids[i]);
+			return -EINVAL;
+		}
+
+		ret = checkpoint_sem_undo_adj(ctx, un);
+		ckpt_debug("checkpoint_sem_undo: %i\n", ret);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int checkpoint_sem_undo(struct ckpt_ctx *ctx, void *ptr)
+{
+	int ret;
+	int *semids = NULL;
+	struct sem_undo_list *ulp = ptr;
+	struct ckpt_hdr_task_sem_undo_list *h;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO_LIST);
+	if (!h)
+		return -ENOMEM;
+
+	ret = get_task_semids(ulp, &semids);
+	if (ret < 0)
+		goto out;
+	h->count = ret;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *)h);
+	if (ret < 0)
+		goto out;
+
+	ret = write_sem_undo_list(ctx, ulp, h->count, semids);
+ out:
+	ckpt_hdr_put(ctx, h);
+	kfree(semids);
+
+	return ret;
+}
+
+int checkpoint_obj_sem_undo(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct sem_undo_list *ulp;
+
+	ulp = t->sysvsem.undo_list;
+	if (ulp)
+		return checkpoint_obj(ctx, ulp, CKPT_OBJ_SEM_UNDO);
+
+	return 0;
+}
+
+/* Count the number of sems for the given sem_undo->semid */
+static int sem_undo_nsems(struct sem_undo *un, struct ipc_namespace *ns)
+{
+	struct sem_array *sma;
+	int nsems;
+
+	sma = sem_lock(ns, un->semid);
+	if (IS_ERR(sma))
+		return PTR_ERR(sma);
+
+	nsems = sma->sem_nsems;
+	sem_unlock(sma);
+
+	return nsems;
+}
+
+static int restore_task_sem_undo_adj(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_sem_undo *h;
+	int len;
+	int ret = -ENOMEM;
+	struct sem_undo *un;
+	int nsems;
+	short *semadj = NULL;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	len = sizeof(__s16) * h->semadj_count;
+	semadj = kzalloc(len, GFP_KERNEL);
+	if (!semadj)
+		goto out;
+
+	ret = _ckpt_read_buffer(ctx, semadj, len);
+	if (ret < 0)
+		goto out;
+
+	un = find_alloc_undo(current->nsproxy->ipc_ns, h->semid);
+	if (IS_ERR(un)) {
+		ret = PTR_ERR(un);
+		ckpt_debug("unable to find semid %i\n", h->semid);
+		goto out;
+	}
+
+	nsems = sem_undo_nsems(un, current->nsproxy->ipc_ns);
+	len = sizeof(__s16) * nsems;
+	if (h->semadj_count == nsems)
+		memcpy(un->semadj, semadj, len);
+	rcu_read_unlock();
+
+	if (nsems != h->semadj_count)
+		ckpt_err(ctx, -EINVAL,
+			 "semid %i has nmsems=%i but %i undo adjustments\n",
+			 h->semid, nsems, h->semadj_count);
+	else
+		ckpt_debug("semid %i restored with %i adjustments\n",
+			   h->semid, h->semadj_count);
+ out:
+	ckpt_hdr_put(ctx, h);
+	kfree(semadj);
+
+	return ret;
+}
+
+static void *restore_sem_undo(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_sem_undo_list *h;
+	struct sem_undo_list *ulp = NULL;
+	int i;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_SEM_UNDO_LIST);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+
+	ulp = alloc_undo_list(current->nsproxy->ipc_ns);
+	if (!ulp) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < h->count; i++) {
+		ret = restore_task_sem_undo_adj(ctx);
+		if (ret < 0)
+			goto out;
+	}
+
+	atomic_inc(&ulp->refcnt);
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	else
+		return ulp;
+}
+
+int restore_obj_sem_undo(struct ckpt_ctx *ctx, int sem_undo_objref)
+{
+	struct sem_undo_list *ulp;
+
+	if (!sem_undo_objref)
+		return 0; /* Task had no undo list */
+
+	ulp = ckpt_obj_try_fetch(ctx, sem_undo_objref, CKPT_OBJ_SEM_UNDO);
+	if (IS_ERR(ulp))
+		return PTR_ERR(ulp);
+
+	/* The first task to restore a shared list should already have this,
+	 * but subsequent ones won't, so attach to current in that case and
+	 * take our reference.
+	 */
+	if (!current->sysvsem.undo_list) {
+		current->sysvsem.undo_list = ulp;
+		atomic_inc(&ulp->refcnt);
+	}
+
+	return 0;
+}
+
+static int obj_sem_undo_grab(void *ptr)
+{
+	struct sem_undo_list *ulp = ptr;
+
+	atomic_inc(&ulp->refcnt);
+	return 0;
+}
+
+static void obj_sem_undo_drop(void *ptr, int lastref)
+{
+	struct sem_undo_list *ulp = ptr;
+
+	put_undo_list(ulp);
+}
+
+static const struct ckpt_obj_ops ckpt_obj_sem_undo_ops = {
+	.obj_name = "IPC_SEM_UNDO",
+	.obj_type = CKPT_OBJ_SEM_UNDO,
+	.ref_drop = obj_sem_undo_drop,
+	.ref_grab = obj_sem_undo_grab,
+	.checkpoint = checkpoint_sem_undo,
+	.restore = restore_sem_undo,
+};
+
+void __init sem_init (void)
+{
+	sem_init_ns(&init_ipc_ns);
+	ipc_init_proc_interface("sysvipc/sem",
+				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
+				IPC_SEM_IDS, sysvipc_sem_proc_show);
+
+	/* sem_undo_list          uses a short
+	*  ckpt_hdr_task_sem_undo uses a __s16
+	*/
+	BUILD_BUG_ON(sizeof(short) != sizeof(__s16));
+
+	register_checkpoint_obj(&ckpt_obj_sem_undo_ops);
+}
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
index 936675a..4ec9cdd 100644
--- a/kernel/checkpoint/process.c
+++ b/kernel/checkpoint/process.c
@@ -236,6 +236,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 	int files_objref;
 	int mm_objref;
 	int fs_objref;
+	int sem_undo_objref;
 	int sighand_objref;
 	int signal_objref;
 	int first, ret;
@@ -283,6 +284,12 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 		return fs_objref;
 	}
 
+	sem_undo_objref = checkpoint_obj_sem_undo(ctx, t);
+	if (sem_undo_objref < 0) {
+		ckpt_err(ctx, sem_undo_objref, "%(T)process sem_undo\n");
+		return sem_undo_objref;
+	}
+
 	sighand_objref = checkpoint_obj_sighand(ctx, t);
 	ckpt_debug("sighand: objref %d\n", sighand_objref);
 	if (sighand_objref < 0) {
@@ -311,6 +318,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 	h->files_objref = files_objref;
 	h->mm_objref = mm_objref;
 	h->fs_objref = fs_objref;
+	h->sem_undo_objref = sem_undo_objref;
 	h->sighand_objref = sighand_objref;
 	h->signal_objref = signal_objref;
 	ret = ckpt_write_obj(ctx, &h->h);
@@ -679,6 +687,11 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
 	if (ret < 0)
 		return ret;
 
+	ret = restore_obj_sem_undo(ctx, h->sem_undo_objref);
+	ckpt_debug("sem_undo: ret %d\n", ret);
+	if (ret < 0)
+		return ret;
+
 	ret = restore_obj_sighand(ctx, h->sighand_objref);
 	ckpt_debug("sighand: ret %d (%p)\n", ret, current->sighand);
 	if (ret < 0)

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2010-08-05 16:17 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-04 17:02 Support for SEM_UNDO, round 2 Dan Smith
     [not found] ` <1280941345-27566-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-08-04 17:02   ` [PATCH 1/2] Add ipc_namespace to struct sem_undo_list Dan Smith
2010-08-04 17:02   ` [PATCH 2/2] Add support for the per-task sem_undo list (v2) Dan Smith
     [not found]     ` <1280941345-27566-3-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-08-04 18:49       ` Oren Laadan
     [not found]         ` <4C59B625.7000007-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2010-08-04 20:17           ` Dan Smith
     [not found]             ` <87pqxyku9j.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2010-08-04 21:29               ` Oren Laadan
     [not found]                 ` <4C59DBD0.1000107-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2010-08-04 21:38                   ` Dan Smith
     [not found]                     ` <87lj8mkqhk.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2010-08-04 21:48                       ` Oren Laadan
2010-08-05 16:17                         ` Dan Smith

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.