All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH (draft)] Change task_struct->comm to use RCU.
       [not found]       ` <201402092327.JAD12489.QOLSFVMHJtFOOF@I-love.SAKURA.ne.jp>
@ 2014-02-10 13:43         ` Tetsuo Handa
  2014-02-17 11:27           ` [PATCH] " Tetsuo Handa
  0 siblings, 1 reply; 20+ messages in thread
From: Tetsuo Handa @ 2014-02-10 13:43 UTC (permalink / raw)
  To: akpm; +Cc: joe, keescook, geert, jkosina, viro, davem, linux-kernel

Hello.

Tetsuo Handa wrote:
> I think there is some point in applying the preparatory patches (commcpy() and
> %pT patches) now, for they fix time-to-call-strlen()-to-time-to-call-memcpy()
> problem by making sure that the comm string passed to strlen() etc. is also
> passed to memcpy() etc. by taking a snapshot of the comm string.
> 
> What the preparatory patches cannot fix is that the comm string is modified
> while taking a snapshot of the comm string. The RCU work will fix it.
> 
> Anyway, you want to see the RCU work before you merge commcpy() and %pT
> patches. OK. I'll start making the RCU work.

This is a draft patch which changes task_struct->comm to use RCU.

This patch currently does not build, for this patch can be applied only after
commcpy() and %pT patches are merged and all direct ->comm users are killed.

This patch sleeps when kmalloc() fails when changing task_struct->comm. But are
we allowed to sleep when changing task_struct->comm ? If yes, there is no need
to rename set_task_comm() to commset() in this patch. If no, I need to find a
different answer.

Regards.
----------
>From c1d907a109a5407c7eaf0e81741f99b4715ba55d Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 10 Feb 2014 19:25:51 +0900
Subject: [PATCH (draft)] Change task_struct->comm to use RCU.

This patch guarantees that the task_struct->comm is updated atomically.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 fs/exec.c                 |   94 ++++++++++++++++++++++++++++++++++++++------
 fs/proc/base.c            |    2 +-
 include/linux/init_task.h |    4 +-
 include/linux/sched.h     |   35 ++++++++++++++---
 kernel/fork.c             |   11 +++++
 kernel/kthread.c          |    6 ++-
 kernel/sched/core.c       |    7 +++-
 kernel/sys.c              |    2 +-
 8 files changed, 135 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 69c6089..d71fd63 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -263,6 +263,11 @@ extern char ___assert_task_state[1 - 2*!!(
 
 /* Task command name length */
 #define TASK_COMM_LEN 16
+struct rcu_comm {
+	char name[TASK_COMM_LEN];
+	struct rcu_head rcu;
+	bool is_static;
+};
 
 #include <linux/spinlock.h>
 
@@ -1317,10 +1322,11 @@ struct task_struct {
 					 * credentials (COW) */
 	const struct cred __rcu *cred;	/* effective (overridable) subjective task
 					 * credentials (COW) */
-	char comm[TASK_COMM_LEN]; /* executable name excluding path
-				     - access with [gs]et_task_comm (which lock
-				       it with task_lock())
-				     - initialized normally by setup_new_exec */
+	struct rcu_comm __rcu *rcu_comm; /* executable name excluding path
+					    - update with commset()
+					    - read with commcpy() or %pT
+					    - initialized normally by
+					      setup_new_exec */
 /* file system info */
 	int link_count, total_link_count;
 #ifdef CONFIG_SYSVIPC
@@ -1805,6 +1811,23 @@ static inline cputime_t task_gtime(struct task_struct *t)
 extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
+/**
+ * commcpy - Copy task_struct->comm to buffer.
+ *
+ * @buf: Buffer to copy @tsk->comm which must be at least TASK_COMM_LEN bytes.
+ * @tsk: Pointer to "struct task_struct".
+ *
+ * Returns @buf .
+ */
+static inline char *commcpy(char *buf, const struct task_struct *tsk)
+{
+	rcu_read_lock();
+	memcpy(buf, rcu_dereference(p->rcu_comm)->name, TASK_COMM_LEN);
+	rcu_read_unlock();
+	buf[TASK_COMM_LEN - 1] = '\0';
+	return buf;
+}
+
 /*
  * Per process flags
  */
@@ -2332,8 +2355,8 @@ extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, i
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
-extern void set_task_comm(struct task_struct *tsk, char *from);
-extern char *get_task_comm(char *to, struct task_struct *tsk);
+void do_commset(struct task_struct *tsk, const char *buf);
+void commset(struct task_struct *tsk, const char *buf);
 
 #ifdef CONFIG_SMP
 void scheduler_ipi(void);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6df7f9f..b217f8c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -154,7 +154,7 @@ extern struct task_group root_task_group;
 # define INIT_VTIME(tsk)
 #endif
 
-#define INIT_TASK_COMM "swapper"
+extern struct rcu_comm init_rcu_comm;
 
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
@@ -201,7 +201,7 @@ extern struct task_group root_task_group;
 	.group_leader	= &tsk,						\
 	RCU_POINTER_INITIALIZER(real_cred, &init_cred),			\
 	RCU_POINTER_INITIALIZER(cred, &init_cred),			\
-	.comm		= INIT_TASK_COMM,				\
+	.rcu_comm       = &init_rcu_comm,				\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
 	.files		= &init_files,					\
diff --git a/fs/exec.c b/fs/exec.c
index 3d78fcc..8c2b1c3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -67,6 +67,9 @@
 
 int suid_dumpable = 0;
 
+/* comm name for init_task. */
+struct rcu_comm init_rcu_comm = { .name = "swapper", .is_static = 1 };
+
 static LIST_HEAD(formats);
 static DEFINE_RWLOCK(binfmt_lock);
 
@@ -1026,27 +1029,92 @@ killed:
 	return -EAGAIN;
 }
 
-char *get_task_comm(char *buf, struct task_struct *tsk)
-{
-	/* buf must be at least sizeof(tsk->comm) in size */
-	task_lock(tsk);
-	strncpy(buf, tsk->comm, sizeof(tsk->comm));
-	task_unlock(tsk);
-	return buf;
-}
-EXPORT_SYMBOL_GPL(get_task_comm);
-
 /*
  * These functions flushes out all traces of the currently running executable
  * so that a new one can be started
  */
 
-void set_task_comm(struct task_struct *tsk, char *buf)
+/**
+ * do_commset - Change task_struct->comm.
+ *
+ * @tsk: Pointer to "struct task_struct".
+ * @buf: New comm name.
+ *
+ * Returns nothing.
+ *
+ * This function will sleep if kmalloc() failed.
+ *
+ * Question: Are we allowed to sleep?
+ */
+void do_commset(struct task_struct *tsk, const char *buf)
+{
+	struct rcu_comm *new;
+	struct rcu_comm *old;
+	struct rcu_comm tmp;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL | __GFP_NOWARN);
+	if (likely(new)) {
+		strncpy(new->name, buf, TASK_COMM_LEN - 1);
+		/* No need to wait because the "new" is kmalloc()ed. */
+		smp_wmb();
+		old = xchg(tsk->rcu_comm, new);
+	} else {
+		/* Memory allocation failed. Use slowpath. */
+		static DEFINE_MUTEX(lock);
+
+		memset(&tmp, 0, sizeof(tmp));
+		tmp.is_static = 1;
+		strncpy(tmp.name, buf, TASK_COMM_LEN - 1);
+		mutex_lock(&lock);
+		/*
+		 * Publish the "tmp" rcu_comm.
+		 * The mutex_lock() above guarantees that the "new" obtained by
+		 * the xchg() below refers either kmemdup()ed rcu_comm as of
+		 * copy_process() or kmalloc()ed rcu_comm as of commset().
+		 */
+		smp_wmb();
+		new = xchg(tsk->rcu_comm, &tmp);
+		/* Wait for readers of the "new" rcu_comm. */
+		synchronize_rcu();
+		/* Reuse the "new" rcu_comm. */
+		memcpy(new->name, tmp.name, TASK_COMM_LEN);
+		/* Publish the "new" rcu_comm. */
+		smp_wmb();
+		old = xchg(tsk->rcu_comm, new);
+		/*
+		 * Wait for readers of the "old" rcu_comm. Note that
+		 * old == &tmp will be false if kmalloc() above by somebody
+		 * else succeeded. Therefore, use the .is_static flag to
+		 * determine whether to kfree() or not.
+		 */
+		synchronize_rcu();
+		mutex_unlock(&lock);
+	}
+	if (!old->is_static)
+		kfree_rcu(old, rcu);
+}
+
+/**
+ * commset - Change task_struct->comm.
+ *
+ * @tsk: Pointer to "struct task_struct".
+ * @buf: New comm name.
+ *
+ * Returns nothing.
+ *
+ * This function will sleep if kmalloc() failed.
+ *
+ * Question: Are we allowed to sleep?
+ */
+void commset(struct task_struct *tsk, const char *buf)
 {
+	/*
+	 * Question: Do we need to use task_lock()/task_unlock() ?
+	 */
 	task_lock(tsk);
 	trace_task_rename(tsk, buf);
-	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
+	do_commset(tsk, buf);
 	perf_event_comm(tsk);
 }
 
@@ -1122,7 +1190,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	else
 		set_dumpable(current->mm, suid_dumpable);
 
-	set_task_comm(current, bprm->tcomm);
+	commset(current, bprm->tcomm);
 
 	/* Set the new mm task size. We have to do that late because it may
 	 * depend on TIF_32BIT which is only updated in flush_thread() on
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c..5ce989d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -214,6 +214,12 @@ void free_task(struct task_struct *tsk)
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
 	arch_release_task_struct(tsk);
+	/*
+	 * It is safe to unconditionally kfree() immediately because this
+	 * function must not be called if somebody is doing do_commset(tsk),
+	 * and tsk->rcu_comm != &init_rcu_comm because tsk != &init_task .
+	 */
+	kfree(tsk->rcu_comm);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1191,6 +1197,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p = dup_task_struct(current);
 	if (!p)
 		goto fork_out;
+	/* This is safe because p is not visible yet. */
+	p->rcu_comm = kmemdup(current->rcu_comm, sizeof(current->rcu_comm),
+			      GFP_KERNEL);
+	if (!p->rcu_comm)
+		goto bad_fork_free;
 
 	ftrace_graph_init_task(p);
 	get_seccomp_filter(p);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2c355bf..f73d6e0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -318,10 +318,12 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	if (!IS_ERR(task)) {
 		static const struct sched_param param = { .sched_priority = 0 };
 		va_list args;
+		char name[TASK_COMM_LEN];
 
 		va_start(args, namefmt);
-		vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
+		vsnprintf(name, TASK_COMM_LEN, namefmt, args);
 		va_end(args);
+		do_commset(task, name);
 		/*
 		 * root may have changed our (kthreadd's) priority or CPU mask.
 		 * The kernel thread should not inherit these properties.
@@ -494,7 +496,7 @@ int kthreadd(void *unused)
 	struct task_struct *tsk = current;
 
 	/* Setup a clean context for our children to inherit. */
-	set_task_comm(tsk, "kthreadd");
+	commset(tsk, "kthreadd");
 	ignore_signals(tsk);
 	set_cpus_allowed_ptr(tsk, cpu_all_mask);
 	set_mems_allowed(node_states[N_MEMORY]);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index feb6630..81f7b16 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4490,7 +4490,12 @@ void init_idle(struct task_struct *idle, int cpu)
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
 #if defined(CONFIG_SMP)
-	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+	{
+		char name[TASK_COMM_LEN];
+
+		snprintf(name, TASK_COMM_LEN, "swapper/%d", cpu);
+		do_commset(idle, name);
+	}
 #endif
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index c0a58be..d6dcaee 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1897,7 +1897,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (strncpy_from_user(comm, (char __user *)arg2,
 				      sizeof(me->comm) - 1) < 0)
 			return -EFAULT;
-		set_task_comm(me, comm);
+		commset(me, comm);
 		proc_comm_connector(me);
 		break;
 	case PR_GET_NAME:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5150706..064f6c3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1396,7 +1396,7 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
 		return -ESRCH;
 
 	if (same_thread_group(current, p))
-		set_task_comm(p, buffer);
+		commset(p, buffer);
 	else
 		count = -EINVAL;
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-10 13:43         ` [PATCH (draft)] Change task_struct->comm to use RCU Tetsuo Handa
@ 2014-02-17 11:27           ` Tetsuo Handa
  2014-02-24 23:51             ` Paul E. McKenney
  2014-02-25  1:49             ` Lai Jiangshan
  0 siblings, 2 replies; 20+ messages in thread
From: Tetsuo Handa @ 2014-02-17 11:27 UTC (permalink / raw)
  To: akpm; +Cc: joe, keescook, geert, jkosina, viro, davem, linux-kernel

Tetsuo Handa wrote:
> This is a draft patch which changes task_struct->comm to use RCU.

Changes from previous draft version:

  Changed "struct rcu_comm" to use copy-on-write approach. Those multi-thread
  or multi-process applications which do not change comm name will consume
  memory for only one "struct rcu_comm".

  Changed do_commset() not to sleep.

  Changed to tolerate loss of consistency when memory allocation failed.

  Fixed race condition in copy_process().

Regards.
----------
>From ada6c4d94f5afda36c7c21869d38b7111a6fe9bc Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 17 Feb 2014 14:32:11 +0900
Subject: [PATCH] Change task_struct->comm to use RCU.

This patch changes task_struct->comm to be updated using RCU
(unless memory allocation fails).

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 fs/exec.c                 |  145 +++++++++++++++++++++++++++++++++++++++------
 include/linux/init_task.h |    4 +-
 include/linux/sched.h     |   37 ++++++++++--
 kernel/fork.c             |    9 +++
 kernel/kthread.c          |    4 +-
 kernel/sched/core.c       |    2 +-
 6 files changed, 173 insertions(+), 28 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec..8a68ab3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -263,6 +263,11 @@ extern char ___assert_task_state[1 - 2*!!(
 
 /* Task command name length */
 #define TASK_COMM_LEN 16
+struct rcu_comm {
+	char name[TASK_COMM_LEN];
+	atomic_t usage;
+	struct rcu_head rcu;
+};
 
 #include <linux/spinlock.h>
 
@@ -1317,10 +1322,12 @@ struct task_struct {
 					 * credentials (COW) */
 	const struct cred __rcu *cred;	/* effective (overridable) subjective task
 					 * credentials (COW) */
-	char comm[TASK_COMM_LEN]; /* executable name excluding path
-				     - access with [gs]et_task_comm (which lock
-				       it with task_lock())
-				     - initialized normally by setup_new_exec */
+	struct rcu_comm __rcu *rcu_comm; /* executable name excluding path (COW)
+					    - update with set_task_comm() or
+					      do_set_task_comm[_fmt]()
+					    - read with commcpy() or %pT
+					    - initialized normally by
+					      setup_new_exec */
 /* file system info */
 	int link_count, total_link_count;
 #ifdef CONFIG_SYSVIPC
@@ -1792,6 +1799,23 @@ static inline cputime_t task_gtime(struct task_struct *t)
 extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
+/**
+ * commcpy - Copy task_struct->comm to buffer.
+ *
+ * @buf: Buffer to copy @tsk->comm which must be at least TASK_COMM_LEN bytes.
+ * @tsk: Pointer to "struct task_struct".
+ *
+ * Returns @buf .
+ */
+static inline char *commcpy(char *buf, const struct task_struct *tsk)
+{
+	rcu_read_lock();
+	memcpy(buf, rcu_dereference(tsk->rcu_comm)->name, TASK_COMM_LEN);
+	rcu_read_unlock();
+	buf[TASK_COMM_LEN - 1] = '\0';
+	return buf;
+}
+
 /*
  * Per process flags
  */
@@ -2320,7 +2344,10 @@ struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 
 extern void set_task_comm(struct task_struct *tsk, char *from);
-extern char *get_task_comm(char *to, struct task_struct *tsk);
+extern void do_set_task_comm(struct task_struct *tsk, const char *buf);
+extern void do_set_task_comm_fmt(struct task_struct *tsk, const char *fmt, ...)
+	__printf(2, 3);
+extern void put_commname(struct rcu_comm *comm);
 
 #ifdef CONFIG_SMP
 void scheduler_ipi(void);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6df7f9f..b217f8c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -154,7 +154,7 @@ extern struct task_group root_task_group;
 # define INIT_VTIME(tsk)
 #endif
 
-#define INIT_TASK_COMM "swapper"
+extern struct rcu_comm init_rcu_comm;
 
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
@@ -201,7 +201,7 @@ extern struct task_group root_task_group;
 	.group_leader	= &tsk,						\
 	RCU_POINTER_INITIALIZER(real_cred, &init_cred),			\
 	RCU_POINTER_INITIALIZER(cred, &init_cred),			\
-	.comm		= INIT_TASK_COMM,				\
+	.rcu_comm       = &init_rcu_comm,				\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
 	.files		= &init_files,					\
diff --git a/fs/exec.c b/fs/exec.c
index 3d78fcc..85c57a6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1026,30 +1026,11 @@ killed:
 	return -EAGAIN;
 }
 
-char *get_task_comm(char *buf, struct task_struct *tsk)
-{
-	/* buf must be at least sizeof(tsk->comm) in size */
-	task_lock(tsk);
-	strncpy(buf, tsk->comm, sizeof(tsk->comm));
-	task_unlock(tsk);
-	return buf;
-}
-EXPORT_SYMBOL_GPL(get_task_comm);
-
 /*
  * These functions flushes out all traces of the currently running executable
  * so that a new one can be started
  */
 
-void set_task_comm(struct task_struct *tsk, char *buf)
-{
-	task_lock(tsk);
-	trace_task_rename(tsk, buf);
-	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
-	task_unlock(tsk);
-	perf_event_comm(tsk);
-}
-
 static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
 {
 	int i, ch;
@@ -1626,3 +1607,129 @@ asmlinkage long compat_sys_execve(const char __user * filename,
 	return compat_do_execve(getname(filename), argv, envp);
 }
 #endif
+
+/* Task's comm name handler functions. */
+static struct kmem_cache *commname_cachep __read_mostly;
+
+/* comm name for init_task. */
+struct rcu_comm init_rcu_comm = {
+	.name = "swapper",
+	.usage = ATOMIC_INIT(INT_MAX) /* Mark this object static. */
+};
+
+/**
+ * set_task_comm - Change task_struct->comm with tracer and perf hooks called.
+ *
+ * @tsk: Pointer to "struct task_struct".
+ * @buf: New comm name.
+ *
+ * Returns nothing.
+ */
+void set_task_comm(struct task_struct *tsk, char *buf)
+{
+	/*
+	 * Question: Do we need to use task_lock()/task_unlock() ?
+	 */
+	task_lock(tsk);
+	trace_task_rename(tsk, buf);
+	task_unlock(tsk);
+	do_set_task_comm(tsk, buf);
+	perf_event_comm(tsk);
+}
+
+/**
+ * do_set_task_comm_fmt - Change task_struct->comm.
+ *
+ * @tsk: Pointer to "struct task_struct".
+ * @fmt: Format string, followed by arguments.
+ *
+ * Returns nothing.
+ */
+void do_set_task_comm_fmt(struct task_struct *tsk, const char *fmt, ...)
+{
+	va_list args;
+	char name[TASK_COMM_LEN];
+
+	va_start(args, fmt);
+	vsnprintf(name, TASK_COMM_LEN, fmt, args);
+	va_end(args);
+	do_set_task_comm(tsk, name);
+}
+
+/**
+ * do_set_task_comm - Change task_struct->comm.
+ *
+ * @tsk: Pointer to "struct task_struct".
+ * @buf: New comm name.
+ *
+ * Returns nothing.
+ */
+void do_set_task_comm(struct task_struct *tsk, const char *buf)
+{
+	struct rcu_comm *comm;
+
+	comm = kmem_cache_zalloc(commname_cachep, GFP_ATOMIC | __GFP_NOWARN);
+	if (likely(comm)) {
+		atomic_set(&comm->usage, 1);
+		strncpy(comm->name, buf, TASK_COMM_LEN - 1);
+		smp_wmb(); /* Behave like rcu_assign_pointer(). */
+		comm = xchg(&tsk->rcu_comm, comm);
+		put_commname(comm);
+	} else {
+		/*
+		 * Memory allocation failed. We have to tolerate loss of
+		 * consistency.
+		 *
+		 * Question 1: Do we want to reserve some amount of static
+		 * "struct rcu_comm" arrays for use upon allocation failures?
+		 *
+		 * Question 2: Do we perfer unchanged comm name over
+		 * overwritten comm name because comm name is copy-on-write ?
+		 */
+		WARN_ONCE(1, "Failed to allocate memory for comm name.\n");
+		rcu_read_lock();
+		do {
+			comm = rcu_dereference(tsk->rcu_comm);
+		} while (!atomic_read(&comm->usage));
+		strncpy(comm->name, buf, TASK_COMM_LEN - 1);
+		rcu_read_unlock();
+	}
+}
+
+/**
+ * free_commname - Release "struct rcu_comm".
+ *
+ * @rcu: Pointer to "struct rcu_head".
+ *
+ * Returns nothing.
+ */
+static void free_commname(struct rcu_head *rcu)
+{
+	struct rcu_comm *comm = container_of(rcu, struct rcu_comm, rcu);
+	kmem_cache_free(commname_cachep, comm);
+}
+
+/**
+ * put_commname - Release "struct rcu_comm".
+ *
+ * @comm: Pointer to "struct rcu_comm".
+ *
+ * Returns nothing.
+ */
+void put_commname(struct rcu_comm *comm)
+{
+	if (atomic_dec_and_test(&comm->usage))
+		call_rcu(&comm->rcu, free_commname);
+}
+
+/**
+ * commname_init - Initialize "struct rcu_comm".
+ *
+ * Returns nothing.
+ */
+void __init commname_init(void)
+{
+	commname_cachep = kmem_cache_create("commname",
+					    sizeof(struct rcu_comm),
+					    0, SLAB_PANIC, NULL);
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c..23be1ef 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -214,6 +214,7 @@ void free_task(struct task_struct *tsk)
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
 	arch_release_task_struct(tsk);
+	put_commname(tsk->rcu_comm);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -249,6 +250,8 @@ EXPORT_SYMBOL_GPL(__put_task_struct);
 
 void __init __weak arch_task_cache_init(void) { }
 
+extern void __init commname_init(void);
+
 void __init fork_init(unsigned long mempages)
 {
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -260,6 +263,7 @@ void __init fork_init(unsigned long mempages)
 		kmem_cache_create("task_struct", sizeof(struct task_struct),
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
+	commname_init();
 
 	/* do the arch specific task caches init */
 	arch_task_cache_init();
@@ -1191,6 +1195,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p = dup_task_struct(current);
 	if (!p)
 		goto fork_out;
+	rcu_read_lock();
+	do {
+		p->rcu_comm = rcu_dereference(current->rcu_comm);
+	} while (!atomic_inc_not_zero(&p->rcu_comm->usage));
+	rcu_read_unlock();
 
 	ftrace_graph_init_task(p);
 	get_seccomp_filter(p);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b5ae3ee..e83b67c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -309,10 +309,12 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	if (!IS_ERR(task)) {
 		static const struct sched_param param = { .sched_priority = 0 };
 		va_list args;
+		char name[TASK_COMM_LEN];
 
 		va_start(args, namefmt);
-		vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
+		vsnprintf(name, TASK_COMM_LEN, namefmt, args);
 		va_end(args);
+		do_set_task_comm(task, name);
 		/*
 		 * root may have changed our (kthreadd's) priority or CPU mask.
 		 * The kernel thread should not inherit these properties.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131e..bbb7c94 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4488,7 +4488,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
 #if defined(CONFIG_SMP)
-	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+	do_set_task_comm_fmt(idle, "swapper/%d", cpu);
 #endif
 }
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-17 11:27           ` [PATCH] " Tetsuo Handa
@ 2014-02-24 23:51             ` Paul E. McKenney
  2014-02-26 13:44               ` Tetsuo Handa
  2014-02-25  1:49             ` Lai Jiangshan
  1 sibling, 1 reply; 20+ messages in thread
From: Paul E. McKenney @ 2014-02-24 23:51 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: akpm, joe, keescook, geert, jkosina, viro, davem, linux-kernel

On Mon, Feb 17, 2014 at 08:27:31PM +0900, Tetsuo Handa wrote:
> Tetsuo Handa wrote:
> > This is a draft patch which changes task_struct->comm to use RCU.
> 
> Changes from previous draft version:
> 
>   Changed "struct rcu_comm" to use copy-on-write approach. Those multi-thread
>   or multi-process applications which do not change comm name will consume
>   memory for only one "struct rcu_comm".
> 
>   Changed do_commset() not to sleep.
> 
>   Changed to tolerate loss of consistency when memory allocation failed.
> 
>   Fixed race condition in copy_process().
> 
> Regards.
> ----------
> >From ada6c4d94f5afda36c7c21869d38b7111a6fe9bc Mon Sep 17 00:00:00 2001
> From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Date: Mon, 17 Feb 2014 14:32:11 +0900
> Subject: [PATCH] Change task_struct->comm to use RCU.
> 
> This patch changes task_struct->comm to be updated using RCU
> (unless memory allocation fails).
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

A few questions and comments below.  With those addressed,

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> ---
>  fs/exec.c                 |  145 +++++++++++++++++++++++++++++++++++++++------
>  include/linux/init_task.h |    4 +-
>  include/linux/sched.h     |   37 ++++++++++--
>  kernel/fork.c             |    9 +++
>  kernel/kthread.c          |    4 +-
>  kernel/sched/core.c       |    2 +-
>  6 files changed, 173 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a781dec..8a68ab3 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -263,6 +263,11 @@ extern char ___assert_task_state[1 - 2*!!(
> 
>  /* Task command name length */
>  #define TASK_COMM_LEN 16
> +struct rcu_comm {
> +	char name[TASK_COMM_LEN];
> +	atomic_t usage;
> +	struct rcu_head rcu;
> +};
> 
>  #include <linux/spinlock.h>
> 
> @@ -1317,10 +1322,12 @@ struct task_struct {
>  					 * credentials (COW) */
>  	const struct cred __rcu *cred;	/* effective (overridable) subjective task
>  					 * credentials (COW) */
> -	char comm[TASK_COMM_LEN]; /* executable name excluding path
> -				     - access with [gs]et_task_comm (which lock
> -				       it with task_lock())
> -				     - initialized normally by setup_new_exec */
> +	struct rcu_comm __rcu *rcu_comm; /* executable name excluding path (COW)
> +					    - update with set_task_comm() or
> +					      do_set_task_comm[_fmt]()
> +					    - read with commcpy() or %pT
> +					    - initialized normally by
> +					      setup_new_exec */
>  /* file system info */
>  	int link_count, total_link_count;
>  #ifdef CONFIG_SYSVIPC
> @@ -1792,6 +1799,23 @@ static inline cputime_t task_gtime(struct task_struct *t)
>  extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
>  extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
> 
> +/**
> + * commcpy - Copy task_struct->comm to buffer.
> + *
> + * @buf: Buffer to copy @tsk->comm which must be at least TASK_COMM_LEN bytes.
> + * @tsk: Pointer to "struct task_struct".
> + *
> + * Returns @buf .
> + */
> +static inline char *commcpy(char *buf, const struct task_struct *tsk)
> +{
> +	rcu_read_lock();
> +	memcpy(buf, rcu_dereference(tsk->rcu_comm)->name, TASK_COMM_LEN);
> +	rcu_read_unlock();
> +	buf[TASK_COMM_LEN - 1] = '\0';
> +	return buf;
> +}
> +
>  /*
>   * Per process flags
>   */
> @@ -2320,7 +2344,10 @@ struct task_struct *fork_idle(int);
>  extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
> 
>  extern void set_task_comm(struct task_struct *tsk, char *from);
> -extern char *get_task_comm(char *to, struct task_struct *tsk);
> +extern void do_set_task_comm(struct task_struct *tsk, const char *buf);
> +extern void do_set_task_comm_fmt(struct task_struct *tsk, const char *fmt, ...)
> +	__printf(2, 3);
> +extern void put_commname(struct rcu_comm *comm);
> 
>  #ifdef CONFIG_SMP
>  void scheduler_ipi(void);
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 6df7f9f..b217f8c 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -154,7 +154,7 @@ extern struct task_group root_task_group;
>  # define INIT_VTIME(tsk)
>  #endif
> 
> -#define INIT_TASK_COMM "swapper"
> +extern struct rcu_comm init_rcu_comm;
> 
>  #ifdef CONFIG_RT_MUTEXES
>  # define INIT_RT_MUTEXES(tsk)						\
> @@ -201,7 +201,7 @@ extern struct task_group root_task_group;
>  	.group_leader	= &tsk,						\
>  	RCU_POINTER_INITIALIZER(real_cred, &init_cred),			\
>  	RCU_POINTER_INITIALIZER(cred, &init_cred),			\
> -	.comm		= INIT_TASK_COMM,				\
> +	.rcu_comm       = &init_rcu_comm,				\
>  	.thread		= INIT_THREAD,					\
>  	.fs		= &init_fs,					\
>  	.files		= &init_files,					\
> diff --git a/fs/exec.c b/fs/exec.c
> index 3d78fcc..85c57a6 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1026,30 +1026,11 @@ killed:
>  	return -EAGAIN;
>  }
> 
> -char *get_task_comm(char *buf, struct task_struct *tsk)
> -{
> -	/* buf must be at least sizeof(tsk->comm) in size */
> -	task_lock(tsk);
> -	strncpy(buf, tsk->comm, sizeof(tsk->comm));
> -	task_unlock(tsk);
> -	return buf;
> -}
> -EXPORT_SYMBOL_GPL(get_task_comm);
> -
>  /*
>   * These functions flushes out all traces of the currently running executable
>   * so that a new one can be started
>   */
> 
> -void set_task_comm(struct task_struct *tsk, char *buf)
> -{
> -	task_lock(tsk);
> -	trace_task_rename(tsk, buf);
> -	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
> -	task_unlock(tsk);
> -	perf_event_comm(tsk);
> -}
> -
>  static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
>  {
>  	int i, ch;
> @@ -1626,3 +1607,129 @@ asmlinkage long compat_sys_execve(const char __user * filename,
>  	return compat_do_execve(getname(filename), argv, envp);
>  }
>  #endif
> +
> +/* Task's comm name handler functions. */
> +static struct kmem_cache *commname_cachep __read_mostly;
> +
> +/* comm name for init_task. */
> +struct rcu_comm init_rcu_comm = {
> +	.name = "swapper",
> +	.usage = ATOMIC_INIT(INT_MAX) /* Mark this object static. */
> +};
> +
> +/**
> + * set_task_comm - Change task_struct->comm with tracer and perf hooks called.
> + *
> + * @tsk: Pointer to "struct task_struct".
> + * @buf: New comm name.
> + *
> + * Returns nothing.
> + */
> +void set_task_comm(struct task_struct *tsk, char *buf)
> +{
> +	/*
> +	 * Question: Do we need to use task_lock()/task_unlock() ?

The have-memory path through do_set_task_comm() does tolerate multiple
CPUs concurrently setting the comm of a given task, but the no-memory
path does not.  I advise keeping the locking, at least unless/until
some workload is having lots of CPUs concurrently changing a given
task's comm.  For some good reason, I hasten to add!  ;-)

Another reason to get rid of the lock is to allow do_set_task_comm()
to sleep waiting for memory to become available, but not sure that
this is a good approach.

> +	 */
> +	task_lock(tsk);
> +	trace_task_rename(tsk, buf);
> +	task_unlock(tsk);
> +	do_set_task_comm(tsk, buf);
> +	perf_event_comm(tsk);
> +}
> +
> +/**
> + * do_set_task_comm_fmt - Change task_struct->comm.
> + *
> + * @tsk: Pointer to "struct task_struct".
> + * @fmt: Format string, followed by arguments.
> + *
> + * Returns nothing.
> + */
> +void do_set_task_comm_fmt(struct task_struct *tsk, const char *fmt, ...)
> +{
> +	va_list args;
> +	char name[TASK_COMM_LEN];
> +
> +	va_start(args, fmt);
> +	vsnprintf(name, TASK_COMM_LEN, fmt, args);
> +	va_end(args);
> +	do_set_task_comm(tsk, name);
> +}
> +
> +/**
> + * do_set_task_comm - Change task_struct->comm.
> + *
> + * @tsk: Pointer to "struct task_struct".
> + * @buf: New comm name.
> + *
> + * Returns nothing.
> + */
> +void do_set_task_comm(struct task_struct *tsk, const char *buf)
> +{
> +	struct rcu_comm *comm;
> +
> +	comm = kmem_cache_zalloc(commname_cachep, GFP_ATOMIC | __GFP_NOWARN);
> +	if (likely(comm)) {
> +		atomic_set(&comm->usage, 1);
> +		strncpy(comm->name, buf, TASK_COMM_LEN - 1);
> +		smp_wmb(); /* Behave like rcu_assign_pointer(). */
> +		comm = xchg(&tsk->rcu_comm, comm);

The value-returning xchg() implies a full memory barrier, so the
smp_wmb() is not needed.

> +		put_commname(comm);
> +	} else {
> +		/*
> +		 * Memory allocation failed. We have to tolerate loss of
> +		 * consistency.
> +		 *
> +		 * Question 1: Do we want to reserve some amount of static
> +		 * "struct rcu_comm" arrays for use upon allocation failures?
> +		 *
> +		 * Question 2: Do we perfer unchanged comm name over
> +		 * overwritten comm name because comm name is copy-on-write ?
> +		 */
> +		WARN_ONCE(1, "Failed to allocate memory for comm name.\n");
> +		rcu_read_lock();
> +		do {
> +			comm = rcu_dereference(tsk->rcu_comm);
> +		} while (!atomic_read(&comm->usage));

So the idea here is to avoid races with someone who might be freeing
this same ->rcu_comm structure, right?  If we see a non-zero reference,
they might still free it, but that would be OK because we are in an
RCU read-side critical section, so it won't actually be freed until
we get done.  And our change is being overwritten by someone else's
in that case, but that is OK because it could happen anyway.

So the loop shouldn't go through many cycles, especially if memory
remains low.

If I am correct, might be worth a comment.  Doubly so if I am wrong.  ;-)

> +		strncpy(comm->name, buf, TASK_COMM_LEN - 1);
> +		rcu_read_unlock();
> +	}
> +}
> +
> +/**
> + * free_commname - Release "struct rcu_comm".
> + *
> + * @rcu: Pointer to "struct rcu_head".
> + *
> + * Returns nothing.
> + */
> +static void free_commname(struct rcu_head *rcu)
> +{
> +	struct rcu_comm *comm = container_of(rcu, struct rcu_comm, rcu);
> +	kmem_cache_free(commname_cachep, comm);
> +}
> +
> +/**
> + * put_commname - Release "struct rcu_comm".
> + *
> + * @comm: Pointer to "struct rcu_comm".
> + *
> + * Returns nothing.
> + */
> +void put_commname(struct rcu_comm *comm)
> +{
> +	if (atomic_dec_and_test(&comm->usage))
> +		call_rcu(&comm->rcu, free_commname);
> +}
> +
> +/**
> + * commname_init - Initialize "struct rcu_comm".
> + *
> + * Returns nothing.
> + */
> +void __init commname_init(void)
> +{
> +	commname_cachep = kmem_cache_create("commname",
> +					    sizeof(struct rcu_comm),
> +					    0, SLAB_PANIC, NULL);
> +}
> diff --git a/kernel/fork.c b/kernel/fork.c
> index a17621c..23be1ef 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -214,6 +214,7 @@ void free_task(struct task_struct *tsk)
>  	ftrace_graph_exit_task(tsk);
>  	put_seccomp_filter(tsk);
>  	arch_release_task_struct(tsk);
> +	put_commname(tsk->rcu_comm);
>  	free_task_struct(tsk);
>  }
>  EXPORT_SYMBOL(free_task);
> @@ -249,6 +250,8 @@ EXPORT_SYMBOL_GPL(__put_task_struct);
> 
>  void __init __weak arch_task_cache_init(void) { }
> 
> +extern void __init commname_init(void);
> +
>  void __init fork_init(unsigned long mempages)
>  {
>  #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
> @@ -260,6 +263,7 @@ void __init fork_init(unsigned long mempages)
>  		kmem_cache_create("task_struct", sizeof(struct task_struct),
>  			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
>  #endif
> +	commname_init();
> 
>  	/* do the arch specific task caches init */
>  	arch_task_cache_init();
> @@ -1191,6 +1195,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	p = dup_task_struct(current);
>  	if (!p)
>  		goto fork_out;
> +	rcu_read_lock();
> +	do {
> +		p->rcu_comm = rcu_dereference(current->rcu_comm);
> +	} while (!atomic_inc_not_zero(&p->rcu_comm->usage));

OK, loop until we get the sole reference on the comm...  But
I suggest an rcu_read_unlock() followed by an rcu_read_lock() as the
first thing in the loop.  Just to prevent adding an RCU CPU stall to
our woes if we cannot get a reference.

And here the two tasks (parent and child) share the name until one
or the other either changes its name or exits.  OK.

> +	rcu_read_unlock();
> 
>  	ftrace_graph_init_task(p);
>  	get_seccomp_filter(p);
> diff --git a/kernel/kthread.c b/kernel/kthread.c
> index b5ae3ee..e83b67c 100644
> --- a/kernel/kthread.c
> +++ b/kernel/kthread.c
> @@ -309,10 +309,12 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
>  	if (!IS_ERR(task)) {
>  		static const struct sched_param param = { .sched_priority = 0 };
>  		va_list args;
> +		char name[TASK_COMM_LEN];
> 
>  		va_start(args, namefmt);
> -		vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
> +		vsnprintf(name, TASK_COMM_LEN, namefmt, args);
>  		va_end(args);
> +		do_set_task_comm(task, name);
>  		/*
>  		 * root may have changed our (kthreadd's) priority or CPU mask.
>  		 * The kernel thread should not inherit these properties.
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b46131e..bbb7c94 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4488,7 +4488,7 @@ void init_idle(struct task_struct *idle, int cpu)
>  	ftrace_graph_init_idle_task(idle, cpu);
>  	vtime_init_idle(idle, cpu);
>  #if defined(CONFIG_SMP)
> -	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
> +	do_set_task_comm_fmt(idle, "swapper/%d", cpu);
>  #endif
>  }
> 
> -- 
> 1.7.1
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-17 11:27           ` [PATCH] " Tetsuo Handa
  2014-02-24 23:51             ` Paul E. McKenney
@ 2014-02-25  1:49             ` Lai Jiangshan
  2014-02-25 10:05               ` Peter Zijlstra
  2014-02-25 12:54               ` Tetsuo Handa
  1 sibling, 2 replies; 20+ messages in thread
From: Lai Jiangshan @ 2014-02-25  1:49 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: akpm, joe, keescook, geert, jkosina, viro, davem, linux-kernel,
	Ingo Molnar, Peter Zijlstra, Steven Rostedt, Thomas Gleixner

CC scheduler people.

I can't figure out what we get with this patch.

On 02/17/2014 07:27 PM, Tetsuo Handa wrote:
> Tetsuo Handa wrote:
>> This is a draft patch which changes task_struct->comm to use RCU.
> 
> Changes from previous draft version:
> 
>   Changed "struct rcu_comm" to use copy-on-write approach. Those multi-thread
>   or multi-process applications which do not change comm name will consume
>   memory for only one "struct rcu_comm".
> 
>   Changed do_commset() not to sleep.
> 
>   Changed to tolerate loss of consistency when memory allocation failed.
> 
>   Fixed race condition in copy_process().
> 
> Regards.
> ----------
>>From ada6c4d94f5afda36c7c21869d38b7111a6fe9bc Mon Sep 17 00:00:00 2001
> From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Date: Mon, 17 Feb 2014 14:32:11 +0900
> Subject: [PATCH] Change task_struct->comm to use RCU.
> 
> This patch changes task_struct->comm to be updated using RCU
> (unless memory allocation fails).
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  fs/exec.c                 |  145 +++++++++++++++++++++++++++++++++++++++------
>  include/linux/init_task.h |    4 +-
>  include/linux/sched.h     |   37 ++++++++++--
>  kernel/fork.c             |    9 +++
>  kernel/kthread.c          |    4 +-
>  kernel/sched/core.c       |    2 +-
>  6 files changed, 173 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a781dec..8a68ab3 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -263,6 +263,11 @@ extern char ___assert_task_state[1 - 2*!!(
>  
>  /* Task command name length */
>  #define TASK_COMM_LEN 16
> +struct rcu_comm {
> +	char name[TASK_COMM_LEN];
> +	atomic_t usage;
> +	struct rcu_head rcu;
> +};
>  


I think the name "rcu_comm" is not good, I suggest that s/rcu_comm/task_comm/g.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-25  1:49             ` Lai Jiangshan
@ 2014-02-25 10:05               ` Peter Zijlstra
  2014-02-25 12:54               ` Tetsuo Handa
  1 sibling, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2014-02-25 10:05 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: Tetsuo Handa, akpm, joe, keescook, geert, jkosina, viro, davem,
	linux-kernel, Ingo Molnar, Steven Rostedt, Thomas Gleixner

On Tue, Feb 25, 2014 at 09:49:40AM +0800, Lai Jiangshan wrote:
> >>From ada6c4d94f5afda36c7c21869d38b7111a6fe9bc Mon Sep 17 00:00:00 2001
> > From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> > Date: Mon, 17 Feb 2014 14:32:11 +0900
> > Subject: [PATCH] Change task_struct->comm to use RCU.
> > 
> > This patch changes task_struct->comm to be updated using RCU
> > (unless memory allocation fails).

This Changelog sucks; it fails to describe a problem; therefore there is
no fix and the patch goes away, *poof*.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-25  1:49             ` Lai Jiangshan
  2014-02-25 10:05               ` Peter Zijlstra
@ 2014-02-25 12:54               ` Tetsuo Handa
  2014-02-25 14:46                 ` Peter Zijlstra
  1 sibling, 1 reply; 20+ messages in thread
From: Tetsuo Handa @ 2014-02-25 12:54 UTC (permalink / raw)
  To: laijs
  Cc: akpm, joe, keescook, geert, jkosina, viro, davem, linux-kernel,
	mingo, peterz, rostedt, tglx

Lai Jiangshan wrote:
> CC scheduler people.
> 
> I can't figure out what we get with this patch.
> 
OK. Welcome to this thread. I'll explain you what is going on.

Current problem:

  printk("%s\n", task->comm) is racy because "%s" format specifier assumes that
  the corresponding argument does not change between strnlen() and the for loop
  at string() in lib/vsnprintf.c . If task->comm was "Hello Linux" until
  strnlen() and becomes "Penguin" before the for loop, "%s" will emit
  "Penguin\0nux" (note the unexpected '\0' byte and the garbage bytes).

  Likewise, audit_log_untrustedstring(ab, current->comm) is racy.
  If task->comm was "Hello Linux" until audit_string_contains_control() in
  audit_log_n_untrustedstring() returns false, and becomes "Penguin" before
  memcpy() in audit_log_n_string() is called, memcpy() will emit "Penguin\0nux"
  into the audit log, which results in loss of information (e.g. SELinux
  context) due to the unexpected '\0' byte.

Proposed solution:

  To fix abovementioned problem, I proposed commcpy() and "%pT" format
  specifier which does

    char tmp[16];
    memcpy(tmp, task->comm, 16);
    tmp[15] = '\0';
    sprintf(buf, "%s", tmp);

  instead of

    sprintf(buf, "%s", task->comm);

  .

Remaining problem:

  Although the proposed solution will prevent the caller from emitting the
  unexpected '\0' byte and the garbage bytes, memcpy(tmp, task->comm, 16) in
  the proposed solution is not atomic. That is, "%pT" does not emit the '\0'
  byte like "Penguin\0nux" but "%pT" still might emit "Penguininux".

  To fix this problem, I proposed protecting memcpy(tmp, task->comm, 16) part
  using RCU. This patch is a design for how the update side of task->comm will
  look like if we use RCU approach.

  Of course, this approach depends on that nobody prefers the speed of reading
  task->comm over the atomicity of reading task->comm . If somebody strongly
  objects on the cost of calling rcu_read_lock()/rcu_read_unlock() for the
  atomicity, I'm fine without this patch.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-25 12:54               ` Tetsuo Handa
@ 2014-02-25 14:46                 ` Peter Zijlstra
  2014-03-07 12:20                   ` Tetsuo Handa
  0 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2014-02-25 14:46 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: laijs, akpm, joe, keescook, geert, jkosina, viro, davem,
	linux-kernel, mingo, rostedt, tglx

On Tue, Feb 25, 2014 at 09:54:01PM +0900, Tetsuo Handa wrote:
> Lai Jiangshan wrote:
> > CC scheduler people.
> > 
> > I can't figure out what we get with this patch.
> > 
> OK. Welcome to this thread. I'll explain you what is going on.
> 
> Current problem:
> 
>   printk("%s\n", task->comm) is racy because "%s" format specifier assumes that
>   the corresponding argument does not change between strnlen() and the for loop
>   at string() in lib/vsnprintf.c . If task->comm was "Hello Linux" until
>   strnlen() and becomes "Penguin" before the for loop, "%s" will emit
>   "Penguin\0nux" (note the unexpected '\0' byte and the garbage bytes).

I would have actually expected it to stop emitting chars at \0. But
sure. Couldn't care less though; that's what you get, we all know this,
we've all been through this discussion several times. Get over it
already.

One of the last threads on this is:

  https://lkml.org/lkml/2011/5/17/516

>   Likewise, audit_log_untrustedstring(ab, current->comm) is racy.
>   If task->comm was "Hello Linux" until audit_string_contains_control() in
>   audit_log_n_untrustedstring() returns false, and becomes "Penguin" before
>   memcpy() in audit_log_n_string() is called, memcpy() will emit "Penguin\0nux"
>   into the audit log, which results in loss of information (e.g. SELinux
>   context) due to the unexpected '\0' byte.

I expect the audit people don't like this? Also, how do audit and the
LSM crap things interact? I thought they were both different piles of
ignorable goo?

See there's not actually a problem statement here at all, so you can't
go about proposing solutions quite yet.

> Proposed solution:
> 
>   To fix abovementioned problem, I proposed commcpy() and "%pT" format
>   specifier which does
> 
>     char tmp[16];
>     memcpy(tmp, task->comm, 16);
>     tmp[15] = '\0';
>     sprintf(buf, "%s", tmp);
> 
>   instead of
> 
>     sprintf(buf, "%s", task->comm);
> 
>   .

How about you do what you're supposed to do when you want a reliable
->comm and use get_task_comm()?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-24 23:51             ` Paul E. McKenney
@ 2014-02-26 13:44               ` Tetsuo Handa
  2014-02-26 15:26                 ` Paul E. McKenney
  0 siblings, 1 reply; 20+ messages in thread
From: Tetsuo Handa @ 2014-02-26 13:44 UTC (permalink / raw)
  To: paulmck; +Cc: akpm, joe, keescook, geert, jkosina, viro, davem, linux-kernel

Thank you for reviewing, Paul.

Paul E. McKenney wrote:
> > +/**
> > + * set_task_comm - Change task_struct->comm with tracer and perf hooks called.
> > + *
> > + * @tsk: Pointer to "struct task_struct".
> > + * @buf: New comm name.
> > + *
> > + * Returns nothing.
> > + */
> > +void set_task_comm(struct task_struct *tsk, char *buf)
> > +{
> > +	/*
> > +	 * Question: Do we need to use task_lock()/task_unlock() ?
> 
> The have-memory path through do_set_task_comm() does tolerate multiple
> CPUs concurrently setting the comm of a given task, but the no-memory
> path does not.  I advise keeping the locking, at least unless/until
> some workload is having lots of CPUs concurrently changing a given
> task's comm.  For some good reason, I hasten to add!  ;-)
> 
That question meant whether trace_task_rename(tsk, buf) needs to be serialized
by tsk->alloc_lock. If trace_task_rename(tsk, buf) doesn't need to be
serialized by tsk->alloc_lock, we can remove task_lock()/task_unlock().

> Another reason to get rid of the lock is to allow do_set_task_comm()
> to sleep waiting for memory to become available, but not sure that
> this is a good approach.

I used GFP_ATOMIC because I don't know whether there are callers that depend on
"changing task->comm does not sleep", for until today "changing task->comm did
not sleep".

> > +void do_set_task_comm(struct task_struct *tsk, const char *buf)
> > +{
> > +	struct rcu_comm *comm;
> > +
> > +	comm = kmem_cache_zalloc(commname_cachep, GFP_ATOMIC | __GFP_NOWARN);
> > +	if (likely(comm)) {
> > +		atomic_set(&comm->usage, 1);
> > +		strncpy(comm->name, buf, TASK_COMM_LEN - 1);
> > +		smp_wmb(); /* Behave like rcu_assign_pointer(). */
> > +		comm = xchg(&tsk->rcu_comm, comm);
> 
> The value-returning xchg() implies a full memory barrier, so the
> smp_wmb() is not needed.
> 
I see.

> > +		put_commname(comm);
> > +	} else {
> > +		/*
> > +		 * Memory allocation failed. We have to tolerate loss of
> > +		 * consistency.
> > +		 *
> > +		 * Question 1: Do we want to reserve some amount of static
> > +		 * "struct rcu_comm" arrays for use upon allocation failures?
> > +		 *
> > +		 * Question 2: Do we perfer unchanged comm name over
> > +		 * overwritten comm name because comm name is copy-on-write ?
> > +		 */
> > +		WARN_ONCE(1, "Failed to allocate memory for comm name.\n");
> > +		rcu_read_lock();
> > +		do {
> > +			comm = rcu_dereference(tsk->rcu_comm);
> > +		} while (!atomic_read(&comm->usage));
> 
> So the idea here is to avoid races with someone who might be freeing
> this same ->rcu_comm structure, right?  If we see a non-zero reference,
> they might still free it, but that would be OK because we are in an
> RCU read-side critical section, so it won't actually be freed until
> we get done.  And our change is being overwritten by someone else's
> in that case, but that is OK because it could happen anyway.
> 
Right.

> So the loop shouldn't go through many cycles, especially if memory
> remains low.
> 
> If I am correct, might be worth a comment.  Doubly so if I am wrong.  ;-)
> 
You are correct.

What about Q1 and Q2, like below ?

  /* Usage is initialized with ATOMIC_INIT(-1). */
  static struct rcu_comm static_rcu_comm[CONFIG_RESERVED_COMMBUFFER];

  static void free_commname(struct rcu_head *rcu)
  {
  	struct rcu_comm *comm = container_of(rcu, struct rcu_comm, rcu);
  	if (comm < &static_rcu_comm[0] ||
  	    comm >= &static_rcu_comm[CONFIG_RESERVED_COMMBUFFER])
  		kmem_cache_free(commname_cachep, comm);
  	else
  		atomic_set(&comm->usage, -1);
  }

  void do_set_task_comm(struct task_struct *tsk, const char *buf)
  {
  	struct rcu_comm *comm;
  
  	comm = kmem_cache_zalloc(commname_cachep, GFP_ATOMIC | __GFP_NOWARN);
  	if (likely(comm))
  		atomic_set(&comm->usage, 1);
  	else {
  		int i;
  
  		/* Memory allocation failed. Try static comm name buffer. */
  		for (i = 0; i < CONFIG_RESERVED_COMMBUFFER; i++) {
  			comm = &static_rcu_comm[i];
  			if (atomic_read(&comm->usage) != -1)
  				continue;
  			if (atomic_add_return(&comm->usage, 2) == 1)
  				goto found;
  		  	put_commname(comm);
  		  	put_commname(comm);
  		}
  		/* No comm name buffer available. Keep current comm name. */
  		WARN_ONCE(1, "Failed to allocate memory for comm name.\n");
  		return;
  	}
  found:
  	strncpy(comm->name, buf, TASK_COMM_LEN - 1);
  	comm = xchg(&tsk->rcu_comm, comm);
  	put_commname(comm);
  }

Since we can preallocate rcu_comm using GFP_KERNEL and return -ENOMEM (e.g.
prepare_bprm_creds() for execve() case, copy_from_user() for prctl(PR_SET_NAME)
and comm_write() (i.e. /proc/$pid/comm ) cases), there will be little users of
static_rcu_comm[] (e.g. kthreadd()).


> > @@ -1191,6 +1195,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> >  	p = dup_task_struct(current);
> >  	if (!p)
> >  		goto fork_out;
> > +	rcu_read_lock();
> > +	do {
> > +		p->rcu_comm = rcu_dereference(current->rcu_comm);
> > +	} while (!atomic_inc_not_zero(&p->rcu_comm->usage));
> 
> OK, loop until we get the sole reference on the comm...  But
> I suggest an rcu_read_unlock() followed by an rcu_read_lock() as the
> first thing in the loop.  Just to prevent adding an RCU CPU stall to
> our woes if we cannot get a reference.
> 
I see. I will use

  p->rcu_comm = NULL;
  do {
  	struct rcu_comm *comm;
  	rcu_read_lock();
  	comm = rcu_dereference(current->rcu_comm);
  	if (atomic_inc_not_zero(&comm->usage))
  		p->rcu_comm = comm;
  	rcu_read_unlock();
  } while (!p->rcu_comm);

in the next version (if we use RCU approach).

> And here the two tasks (parent and child) share the name until one
> or the other either changes its name or exits.  OK.
> 
Yes, this is copy on write approach.

Another approach could use heuristic atomicity; change from "char comm[16]"
to "long comm[16/sizeof(long)]" and read/write using "long". By using "long",
we can reduce the possibility of getting the mixture of current comm name and
comm name to update to, for reading/writing of "long" is atomic.

The horizontal axis of below map is the strlen() of current comm name and the
vertical axis is the strlen() of comm name to update to. If sizeof(long) == 8,
193 out of 256 patterns can be atomically updated without any locks. Moreover,
since strlen() of at least one of current comm name and new comm name tends to
be shorter than sizeof(long) (e.g. "bash" -> "ls", "rc.sysinit" -> "cat",
"bash" -> "avahi-daemon" ), update of comm name will more likely be atomic
without any locks.

    | 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
  --+-----------------------------------------------
   0| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   1| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   2| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   3| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   4| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   5| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   6| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   7| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   8| o  o  o  o  o  o  o  o  o  x  x  x  x  x  x  x
   9| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  10| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  11| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  12| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  13| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  14| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  15| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x

> > +	rcu_read_unlock();
> > 
> >  	ftrace_graph_init_task(p);
> >  	get_seccomp_filter(p);

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-26 13:44               ` Tetsuo Handa
@ 2014-02-26 15:26                 ` Paul E. McKenney
  0 siblings, 0 replies; 20+ messages in thread
From: Paul E. McKenney @ 2014-02-26 15:26 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: akpm, joe, keescook, geert, jkosina, viro, davem, linux-kernel

On Wed, Feb 26, 2014 at 10:44:32PM +0900, Tetsuo Handa wrote:
> Thank you for reviewing, Paul.

No problem, but you do also need to address Lai Jiangshan's and
Peter Zijlstra's questions about the purpose of this patch.  I was
looking at it only from a "does it work?" viewpoint.

> Paul E. McKenney wrote:
> > > +/**
> > > + * set_task_comm - Change task_struct->comm with tracer and perf hooks called.
> > > + *
> > > + * @tsk: Pointer to "struct task_struct".
> > > + * @buf: New comm name.
> > > + *
> > > + * Returns nothing.
> > > + */
> > > +void set_task_comm(struct task_struct *tsk, char *buf)
> > > +{
> > > +	/*
> > > +	 * Question: Do we need to use task_lock()/task_unlock() ?
> > 
> > The have-memory path through do_set_task_comm() does tolerate multiple
> > CPUs concurrently setting the comm of a given task, but the no-memory
> > path does not.  I advise keeping the locking, at least unless/until
> > some workload is having lots of CPUs concurrently changing a given
> > task's comm.  For some good reason, I hasten to add!  ;-)
> > 
> That question meant whether trace_task_rename(tsk, buf) needs to be serialized
> by tsk->alloc_lock. If trace_task_rename(tsk, buf) doesn't need to be
> serialized by tsk->alloc_lock, we can remove task_lock()/task_unlock().

I was attempting ot answer that question.  ;-)

> > Another reason to get rid of the lock is to allow do_set_task_comm()
> > to sleep waiting for memory to become available, but not sure that
> > this is a good approach.
> 
> I used GFP_ATOMIC because I don't know whether there are callers that depend on
> "changing task->comm does not sleep", for until today "changing task->comm did
> not sleep".

OK...

> > > +void do_set_task_comm(struct task_struct *tsk, const char *buf)
> > > +{
> > > +	struct rcu_comm *comm;
> > > +
> > > +	comm = kmem_cache_zalloc(commname_cachep, GFP_ATOMIC | __GFP_NOWARN);
> > > +	if (likely(comm)) {
> > > +		atomic_set(&comm->usage, 1);
> > > +		strncpy(comm->name, buf, TASK_COMM_LEN - 1);
> > > +		smp_wmb(); /* Behave like rcu_assign_pointer(). */
> > > +		comm = xchg(&tsk->rcu_comm, comm);
> > 
> > The value-returning xchg() implies a full memory barrier, so the
> > smp_wmb() is not needed.
> > 
> I see.
> 
> > > +		put_commname(comm);
> > > +	} else {
> > > +		/*
> > > +		 * Memory allocation failed. We have to tolerate loss of
> > > +		 * consistency.
> > > +		 *
> > > +		 * Question 1: Do we want to reserve some amount of static
> > > +		 * "struct rcu_comm" arrays for use upon allocation failures?
> > > +		 *
> > > +		 * Question 2: Do we perfer unchanged comm name over
> > > +		 * overwritten comm name because comm name is copy-on-write ?
> > > +		 */
> > > +		WARN_ONCE(1, "Failed to allocate memory for comm name.\n");
> > > +		rcu_read_lock();
> > > +		do {
> > > +			comm = rcu_dereference(tsk->rcu_comm);
> > > +		} while (!atomic_read(&comm->usage));
> > 
> > So the idea here is to avoid races with someone who might be freeing
> > this same ->rcu_comm structure, right?  If we see a non-zero reference,
> > they might still free it, but that would be OK because we are in an
> > RCU read-side critical section, so it won't actually be freed until
> > we get done.  And our change is being overwritten by someone else's
> > in that case, but that is OK because it could happen anyway.
> > 
> Right.
> 
> > So the loop shouldn't go through many cycles, especially if memory
> > remains low.
> > 
> > If I am correct, might be worth a comment.  Doubly so if I am wrong.  ;-)
> > 
> You are correct.
> 
> What about Q1 and Q2, like below ?

I suggest reviewing the LKML thread that Peter Zijlstra pointed you at
before digging too much further into this.  Especially this one:

	https://lkml.org/lkml/2011/5/18/408

							Thanx, Paul

>   /* Usage is initialized with ATOMIC_INIT(-1). */
>   static struct rcu_comm static_rcu_comm[CONFIG_RESERVED_COMMBUFFER];
> 
>   static void free_commname(struct rcu_head *rcu)
>   {
>   	struct rcu_comm *comm = container_of(rcu, struct rcu_comm, rcu);
>   	if (comm < &static_rcu_comm[0] ||
>   	    comm >= &static_rcu_comm[CONFIG_RESERVED_COMMBUFFER])
>   		kmem_cache_free(commname_cachep, comm);
>   	else
>   		atomic_set(&comm->usage, -1);
>   }
> 
>   void do_set_task_comm(struct task_struct *tsk, const char *buf)
>   {
>   	struct rcu_comm *comm;
>   
>   	comm = kmem_cache_zalloc(commname_cachep, GFP_ATOMIC | __GFP_NOWARN);
>   	if (likely(comm))
>   		atomic_set(&comm->usage, 1);
>   	else {
>   		int i;
>   
>   		/* Memory allocation failed. Try static comm name buffer. */
>   		for (i = 0; i < CONFIG_RESERVED_COMMBUFFER; i++) {
>   			comm = &static_rcu_comm[i];
>   			if (atomic_read(&comm->usage) != -1)
>   				continue;
>   			if (atomic_add_return(&comm->usage, 2) == 1)
>   				goto found;
>   		  	put_commname(comm);
>   		  	put_commname(comm);
>   		}
>   		/* No comm name buffer available. Keep current comm name. */
>   		WARN_ONCE(1, "Failed to allocate memory for comm name.\n");
>   		return;
>   	}
>   found:
>   	strncpy(comm->name, buf, TASK_COMM_LEN - 1);
>   	comm = xchg(&tsk->rcu_comm, comm);
>   	put_commname(comm);
>   }
> 
> Since we can preallocate rcu_comm using GFP_KERNEL and return -ENOMEM (e.g.
> prepare_bprm_creds() for execve() case, copy_from_user() for prctl(PR_SET_NAME)
> and comm_write() (i.e. /proc/$pid/comm ) cases), there will be little users of
> static_rcu_comm[] (e.g. kthreadd()).
> 
> 
> > > @@ -1191,6 +1195,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> > >  	p = dup_task_struct(current);
> > >  	if (!p)
> > >  		goto fork_out;
> > > +	rcu_read_lock();
> > > +	do {
> > > +		p->rcu_comm = rcu_dereference(current->rcu_comm);
> > > +	} while (!atomic_inc_not_zero(&p->rcu_comm->usage));
> > 
> > OK, loop until we get the sole reference on the comm...  But
> > I suggest an rcu_read_unlock() followed by an rcu_read_lock() as the
> > first thing in the loop.  Just to prevent adding an RCU CPU stall to
> > our woes if we cannot get a reference.
> > 
> I see. I will use
> 
>   p->rcu_comm = NULL;
>   do {
>   	struct rcu_comm *comm;
>   	rcu_read_lock();
>   	comm = rcu_dereference(current->rcu_comm);
>   	if (atomic_inc_not_zero(&comm->usage))
>   		p->rcu_comm = comm;
>   	rcu_read_unlock();
>   } while (!p->rcu_comm);
> 
> in the next version (if we use RCU approach).
> 
> > And here the two tasks (parent and child) share the name until one
> > or the other either changes its name or exits.  OK.
> > 
> Yes, this is copy on write approach.
> 
> Another approach could use heuristic atomicity; change from "char comm[16]"
> to "long comm[16/sizeof(long)]" and read/write using "long". By using "long",
> we can reduce the possibility of getting the mixture of current comm name and
> comm name to update to, for reading/writing of "long" is atomic.
> 
> The horizontal axis of below map is the strlen() of current comm name and the
> vertical axis is the strlen() of comm name to update to. If sizeof(long) == 8,
> 193 out of 256 patterns can be atomically updated without any locks. Moreover,
> since strlen() of at least one of current comm name and new comm name tends to
> be shorter than sizeof(long) (e.g. "bash" -> "ls", "rc.sysinit" -> "cat",
> "bash" -> "avahi-daemon" ), update of comm name will more likely be atomic
> without any locks.
> 
>     | 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
>   --+-----------------------------------------------
>    0| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
>    1| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
>    2| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
>    3| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
>    4| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
>    5| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
>    6| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
>    7| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
>    8| o  o  o  o  o  o  o  o  o  x  x  x  x  x  x  x
>    9| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
>   10| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
>   11| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
>   12| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
>   13| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
>   14| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
>   15| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
> 
> > > +	rcu_read_unlock();
> > > 
> > >  	ftrace_graph_init_task(p);
> > >  	get_seccomp_filter(p);
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-02-25 14:46                 ` Peter Zijlstra
@ 2014-03-07 12:20                   ` Tetsuo Handa
  2014-03-07 15:54                     ` Richard Guy Briggs
  0 siblings, 1 reply; 20+ messages in thread
From: Tetsuo Handa @ 2014-03-07 12:20 UTC (permalink / raw)
  To: peterz, paulmck, rgb
  Cc: laijs, akpm, joe, keescook, geert, jkosina, viro, davem,
	linux-kernel, mingo, rostedt, tglx, linux-security-module

Peter Zijlstra wrote:
> I would have actually expected it to stop emitting chars at \0. But
> sure. Couldn't care less though; that's what you get, we all know this,
> we've all been through this discussion several times. Get over it
> already.
> 
> One of the last threads on this is:
> 
>   https://lkml.org/lkml/2011/5/17/516
> 
Yes, I knew the attempts to make comm access consistent. I was wondering why
such proposal is not yet accepted.

Thank you for pointing that thread out. I found the following comment in that
thread.

Linus Torvalds wrote:
| What folks?
| 
| I don't think a new lock (or any lock) is at all appropriate.
| 
| There's just no point. Just guarantee that the last byte is always
| zero, and you're done.
| 
| If you just guarantee that, THERE IS NO RACE. The last byte never
| changes. You may get odd half-way strings, but you've trivially
| guaranteed that they are C NUL-terminated, with no locking, no memory
| ordering, no nothing.

He stated that no locks are acceptable. Then, there is no room to insert
rcu_read_lock()/rcu_read_unlock() into the reader side. The only way which can
be implemented without any locks would be a heuristic atomicity shown later.

> >   Likewise, audit_log_untrustedstring(ab, current->comm) is racy.
> >   If task->comm was "Hello Linux" until audit_string_contains_control() in
> >   audit_log_n_untrustedstring() returns false, and becomes "Penguin" before
> >   memcpy() in audit_log_n_string() is called, memcpy() will emit "Penguin\0nux"
> >   into the audit log, which results in loss of information (e.g. SELinux
> >   context) due to the unexpected '\0' byte.
> 
> I expect the audit people don't like this? Also, how do audit and the
> LSM crap things interact? I thought they were both different piles of
> ignorable goo?
> 

I think the audit people do not like loss of information. Some of LSM modules
are using audit subsystem for recording security related events. An example is
shown later.

> See there's not actually a problem statement here at all, so you can't
> go about proposing solutions quite yet.
> 

Please see the patch.

> > Proposed solution:
> > 
> >   To fix abovementioned problem, I proposed commcpy() and "%pT" format
> >   specifier which does
> > 
> >     char tmp[16];
> >     memcpy(tmp, task->comm, 16);
> >     tmp[15] = '\0';
> >     sprintf(buf, "%s", tmp);
> > 
> >   instead of
> > 
> >     sprintf(buf, "%s", task->comm);
> > 
> >   .
> 
> How about you do what you're supposed to do when you want a reliable
> ->comm and use get_task_comm()?
> 

I always want a reliable ->comm . But get_task_comm() is not for calling from
vsnprintf(), for somebody might read task's commname from NMI context.
I tried to use RCU for reading from vsnprintf() but Linus will not accept it.

Regards.

----------

>From 212d983aa143954c089177942b6a7d9ab5393f8f Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 7 Mar 2014 11:00:24 +0900
Subject: [PATCH] Read/update task's commname sizeof(long) bytes at a time.

"struct task_struct"->comm is read from various locations, but it is not
updated atomically. As a result, readers may see a mixture of old and new
commname.

A bogus NUL byte in the mixture of old and new commname caused by TOCTTOU
race can have unwanted side effect. For example, a sequence described below
could happen when audit_log_untrustedstring(ab, tsk->comm) is called.

  (1) Initial tsk->comm contains "penguin-kernel\0\0"
  (2) CPU 0 attempts to audit tsk->comm using
      audit_log_untrustedstring(ab, tsk->comm)
  (3) audit_log_untrustedstring(ab, tsk->comm) calls
      audit_log_n_untrustedstring(ab, tsk->comm, strlen(tsk->comm))
  (4) strlen(tsk->comm) is called and it returns 14
  (5) audit_log_n_untrustedstring(ab, tsk->comm, 14) calls
      audit_string_contains_control(tsk->comm, 14)
  (6) audit_string_contains_control(tsk->comm, 14) returns 0
  (7) CPU 1 attempts to write tsk->comm using set_task_comm(tsk, "penguin")
  (8) CPU 1 holds tsk->alloc_lock
  (9) CPU 1 calls strlcpy(tsk->comm, "penguin", 16)
      and now tsk->comm contains "penguin\0kernel\0\0"
  (10) CPU 1 releases task_struct->alloc_lock
  (11) audit_log_n_string(ab, tsk->comm, 14) calls memcpy(ptr, tsk->comm, 14)
  (12) memcpy(ptr, tsk->comm, 14) will copy "penguin\0kernel"
  (13) CPU 0 appends the rest of information and enqueues the entire log
  (14) the audit daemon reads the entire log, but the information after
       "penguin\0" are lost because of erroneously copied '\0' byte.

Since system call auditing and LSM modules include task's commname in their
record, the straying NUL byte can unexpectedly truncate that record, leading
loss of information which should have been recorded.

This patch introduces do_get_task_comm() and do_set_task_comm(). The former
locklessly reads task's commname using "long" and the latter locklessly updates
task's commname using "long". By using "long", we can reduce the possibility of
getting the mixture of old and new commname, for reading/writing of "long" is
atomic.

The horizontal axis of below map is the strlen() of current commname and the
vertical axis is the strlen() of commname to update to. If sizeof(long) == 8,
193 out of 256 patterns can be atomically updated without any locks. Moreover,
since strlen() of at least one of current commname and new commname tends to
be shorter than sizeof(long) (e.g. "bash" -> "ls", "rc.sysinit" -> "cat",
"bash" -> "avahi-daemon" ), update of commname will more likely be atomic
without any locks.

    | 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
  --+-----------------------------------------------
   0| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   1| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   2| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   3| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   4| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   5| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   6| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   7| o  o  o  o  o  o  o  o  o  o  o  o  o  o  o  o
   8| o  o  o  o  o  o  o  o  o  x  x  x  x  x  x  x
   9| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  10| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  11| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  12| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  13| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  14| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x
  15| o  o  o  o  o  o  o  o  x  x  x  x  x  x  x  x

By converting all readers to use [do_]get_task_comm() and all writers to
use [do_]set_task_comm(), we get the following advantages.

  (1) The straying NUL byte will be completely removed.

  (2) The possibility of seeing the mixture of old and new commname is
      to some degree reduced. The possibility will become 0 depending on
      the length of commname.

An example of how to use do_get_task_comm() is shown below.

  -audit_log_untrustedstring(ab, tsk->comm);
  +char buf[TASK_COMM_LEN];
  +audit_log_untrustedstring(ab, do_get_task_comm(buf, tsk));

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 fs/exec.c             |   69 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/sched.h |   29 ++++++++++++++++++++
 2 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec..8e0a212 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1792,6 +1792,35 @@ static inline cputime_t task_gtime(struct task_struct *t)
 extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
+/**
+ * do_get_task_comm - Copy task_struct->comm to buffer.
+ *
+ * @buf: Buffer to copy @tsk->comm which must be at least TASK_COMM_LEN bytes.
+ * @tsk: Pointer to "struct task_struct".
+ *
+ * Returns @buf .
+ *
+ * Please use this wrapper function which reads @tsk->comm in a heuristic
+ * atomic way.
+ */
+static inline char *do_get_task_comm(char *buf, const struct task_struct *tsk)
+{
+	unsigned long *d = (unsigned long *) buf;
+	unsigned long *s = (unsigned long *) tsk->comm;
+#if BITS_PER_LONG == 32
+	*d++ = *s++;
+	*d++ = *s++;
+#endif
+	*d++ = *s++;
+	*d = *s;
+	/*
+	 * This is a safeguard which is used until all users are converted to
+	 * use [do_]set_task_comm().
+	 */
+	buf[TASK_COMM_LEN - 1] = '\0';
+	return buf;
+}
+
 /*
  * Per process flags
  */
diff --git a/fs/exec.c b/fs/exec.c
index 3d78fcc..7f66dcf 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1030,7 +1030,7 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
 {
 	/* buf must be at least sizeof(tsk->comm) in size */
 	task_lock(tsk);
-	strncpy(buf, tsk->comm, sizeof(tsk->comm));
+	do_get_task_comm(buf, tsk);
 	task_unlock(tsk);
 	return buf;
 }
@@ -1041,11 +1041,76 @@ EXPORT_SYMBOL_GPL(get_task_comm);
  * so that a new one can be started
  */
 
+void do_set_task_comm(struct task_struct *tsk, char *buf)
+{
+	long new[TASK_COMM_LEN / sizeof(long)];
+	long *old = (unsigned long *) tsk->comm;
+	/*
+	 * This is a safeguard which is used until all users are converted to
+	 * use [do_]set_task_comm().
+	 */
+	const u8 old_len = strnlen((char *) old, TASK_COMM_LEN);
+	u8 new_len = 0;
+
+	BUILD_BUG_ON(TASK_COMM_LEN % sizeof(long));
+	/*
+	 * Copy to temporary buffer, for size of "buf" may be smaller than
+	 * TASK_COMM_LEN bytes. We do not use strnlen() + memcpy() in order to
+	 * avoid TOCTTOU race, in case "buf" changed while copying.
+	 */
+	while (new_len < TASK_COMM_LEN - 1 && *buf)
+		((char *) new)[new_len++] = *buf++;
+	while (new_len < TASK_COMM_LEN)
+		((char *) new)[new_len++] = '\0';
+	/*
+	 * Update tsk->comm in a heuristic atomic way.
+	 *
+	 * If writers update one byte at a time using string manipulating
+	 * functions, readers have at most 14 chances of race.
+	 *
+	 * If writers update sizeof(long) bytes at a time, readers have at most
+	 * 1 (for 64bits arch) or 3 (for 32bits arch) chances of race. And even
+	 * if a reader races, the reader never sees mixture of old and new
+	 * tsk->comm within each sizeof(long) bytes. Also, if length of old
+	 * tsk->comm is shorter than sizeof(long) bytes, readers who are using
+	 * do_get_task_comm() have no chance of reading partially updated
+	 * tsk->comm unless disturbed by other factors (e.g. interrupts,
+	 * preemption, memory access reordering).
+	 *
+	 * This preempt_disable_notrace() is meant for reducing the possibility
+	 * of disturbance, but not meant to be a perfect barrier; readers have
+	 * no locks and no memory access orderings after all.
+	 */
+	preempt_disable_notrace();
+	if (old_len < sizeof(long)) {
+		old[1] = new[1];
+#if BITS_PER_LONG == 32
+		old[2] = new[2];
+		old[3] = new[3];
+#endif
+		/*
+		 * This memory barrier is meant for reducing the possibility of
+		 * reading new[1] before new[0], but not meant to be a perfect
+		 * barrier.
+		 */
+		smp_wmb();
+		old[0] = new[0];
+	} else {
+		old[0] = new[0];
+		old[1] = new[1];
+#if BITS_PER_LONG == 32
+		old[2] = new[2];
+		old[3] = new[3];
+#endif
+	}
+	preempt_enable_no_resched_notrace();
+}
+
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
 	task_lock(tsk);
 	trace_task_rename(tsk, buf);
-	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
+	do_set_task_comm(tsk, buf);
 	task_unlock(tsk);
 	perf_event_comm(tsk);
 }
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-03-07 12:20                   ` Tetsuo Handa
@ 2014-03-07 15:54                     ` Richard Guy Briggs
  2014-03-08 12:43                       ` Tetsuo Handa
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Guy Briggs @ 2014-03-07 15:54 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: peterz, paulmck, laijs, akpm, joe, keescook, geert, jkosina,
	viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

On 14/03/07, Tetsuo Handa wrote:
> Peter Zijlstra wrote:
> >   https://lkml.org/lkml/2011/5/17/516

> Thank you for pointing that thread out. I found the following comment in that
> thread.
> 
> Linus Torvalds wrote:
> | What folks?
> | 
> | I don't think a new lock (or any lock) is at all appropriate.
> | 
> | There's just no point. Just guarantee that the last byte is always
> | zero, and you're done.
> | 
> | If you just guarantee that, THERE IS NO RACE. The last byte never
> | changes. You may get odd half-way strings, but you've trivially
> | guaranteed that they are C NUL-terminated, with no locking, no memory
> | ordering, no nothing.

> > >   Likewise, audit_log_untrustedstring(ab, current->comm) is racy.
> > >   If task->comm was "Hello Linux" until audit_string_contains_control() in
> > >   audit_log_n_untrustedstring() returns false, and becomes "Penguin" before
> > >   memcpy() in audit_log_n_string() is called, memcpy() will emit "Penguin\0nux"
> > >   into the audit log, which results in loss of information (e.g. SELinux
> > >   context) due to the unexpected '\0' byte.
> > 
> > I expect the audit people don't like this? Also, how do audit and the
> > LSM crap things interact? I thought they were both different piles of
> > ignorable goo?
> 
> I think the audit people do not like loss of information. Some of LSM modules
> are using audit subsystem for recording security related events. An example is
> shown later.

This is true, however since comm it untrusted because it can be modified
by the user audit doesn't trust it anyways, so who cares?

> > How about you do what you're supposed to do when you want a reliable
> > ->comm and use get_task_comm()?
> 
> I always want a reliable ->comm . But get_task_comm() is not for calling from
> vsnprintf(), for somebody might read task's commname from NMI context.
> I tried to use RCU for reading from vsnprintf() but Linus will not accept it.

- RGB

--
Richard Guy Briggs <rbriggs@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-03-07 15:54                     ` Richard Guy Briggs
@ 2014-03-08 12:43                       ` Tetsuo Handa
  2014-03-10 20:21                         ` Richard Guy Briggs
  0 siblings, 1 reply; 20+ messages in thread
From: Tetsuo Handa @ 2014-03-08 12:43 UTC (permalink / raw)
  To: rgb
  Cc: peterz, paulmck, laijs, akpm, joe, keescook, geert, jkosina,
	viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

Richard Guy Briggs wrote:
> > > >   Likewise, audit_log_untrustedstring(ab, current->comm) is racy.
> > > >   If task->comm was "Hello Linux" until audit_string_contains_control() in
> > > >   audit_log_n_untrustedstring() returns false, and becomes "Penguin" before
> > > >   memcpy() in audit_log_n_string() is called, memcpy() will emit "Penguin\0nux"
> > > >   into the audit log, which results in loss of information (e.g. SELinux
> > > >   context) due to the unexpected '\0' byte.
> > > 
> > > I expect the audit people don't like this? Also, how do audit and the
> > > LSM crap things interact? I thought they were both different piles of
> > > ignorable goo?
> > 
> > I think the audit people do not like loss of information. Some of LSM modules
> > are using audit subsystem for recording security related events. An example is
> > shown later.
> 
> This is true, however since comm it untrusted because it can be modified
> by the user audit doesn't trust it anyways, so who cares?

Excuse me, but did you understand this side effect correctly?

# ln /bin/true /tmp/printable-comm
# auditctl -a exit,always -S execve -F path=/tmp/printable-comm
# /tmp/printable-comm
# cat /var/log/audit/audit.log

If we didn't race, everything is fine.

---------- An audit log without race ----------
type=SYSCALL msg=audit(1394281486.738:62): arch=40000003 syscall=11 success=yes exit=0 a0=8589c48 a1=85a6620 a2=858e4a0 a3=85a6620 items=2 ppid=1747 pid=2657 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts1 ses=2 comm="printable-comm" exe="/tmp/printable-comm" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)
type=EXECVE msg=audit(1394281486.738:62): argc=1 a0="/tmp/printable-comm"
type=CWD msg=audit(1394281486.738:62):  cwd="/root"
type=PATH msg=audit(1394281486.738:62): item=0 name="/tmp/printable-comm" inode=1970955 dev=08:01 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL
type=PATH msg=audit(1394281486.738:62): item=1 name=(null) inode=2360187 dev=08:01 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL
---------- An audit log without race ----------

But if we raced (you can use a (dangerous) SystemTap script shown below for
emulating this race condition

# stap -g -e '
function rewrite_comm(str:long) %{
  strlcpy((char *) (long) STAP_ARG_str, "truncated", sizeof(current->comm));
%}
probe kernel.function("audit_log_n_string") {
  if ($ab && $slen == 14 && kernel_string($string) == "printable-comm") {
    rewrite_comm($string); printf("<%s>\n", kernel_string($string))
  };
}
'

), you can see that fields after comm= (e.g. exe= subj= key= ) are missing.

---------- An audit log with race ----------
type=SYSCALL msg=audit(1394281498.566:63): arch=40000003 syscall=11 success=yes exit=0 a0=858c9c8 a1=85a6620 a2=858e4a0 a3=85a6620 items=2 ppid=1747 pid=2662 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts1 ses=2 comm="truncated
type=EXECVE msg=audit(1394281498.566:63): argc=1 a0="/tmp/printable-comm"
type=CWD msg=audit(1394281498.566:63):  cwd="/root"
type=PATH msg=audit(1394281498.566:63): item=0 name="/tmp/printable-comm" inode=1970955 dev=08:01 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL
type=PATH msg=audit(1394281498.566:63): item=1 name=(null) inode=2360187 dev=08:01 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL
---------- An audit log with race ----------

Even if you don't trust the comm= field, it is annoying for me that fields
after comm= are missing in the audit log.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-03-08 12:43                       ` Tetsuo Handa
@ 2014-03-10 20:21                         ` Richard Guy Briggs
  2014-03-11 12:02                           ` Tetsuo Handa
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Guy Briggs @ 2014-03-10 20:21 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: peterz, paulmck, laijs, akpm, joe, keescook, geert, jkosina,
	viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

On 14/03/08, Tetsuo Handa wrote:
> Richard Guy Briggs wrote:
> > > > >   Likewise, audit_log_untrustedstring(ab, current->comm) is racy.
> > > > >   If task->comm was "Hello Linux" until audit_string_contains_control() in
> > > > >   audit_log_n_untrustedstring() returns false, and becomes "Penguin" before
> > > > >   memcpy() in audit_log_n_string() is called, memcpy() will emit "Penguin\0nux"
> > > > >   into the audit log, which results in loss of information (e.g. SELinux
> > > > >   context) due to the unexpected '\0' byte.
> > > > 
> > > > I expect the audit people don't like this? Also, how do audit and the
> > > > LSM crap things interact? I thought they were both different piles of
> > > > ignorable goo?
> > > 
> > > I think the audit people do not like loss of information. Some of LSM modules
> > > are using audit subsystem for recording security related events. An example is
> > > shown later.
> > 
> > This is true, however since comm it untrusted because it can be modified
> > by the user audit doesn't trust it anyways, so who cares?
> 
> Excuse me, but did you understand this side effect correctly?

<snip>

> ), you can see that fields after comm= (e.g. exe= subj= key= ) are missing.

Ok, from your desciption and example I had clearly not fully understood
the problem.

> ---------- An audit log with race ----------
> type=SYSCALL msg=audit(1394281498.566:63): arch=40000003 syscall=11 success=yes exit=0 a0=858c9c8 a1=85a6620 a2=858e4a0 a3=85a6620 items=2 ppid=1747 pid=2662 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts1 ses=2 comm="truncated
> type=EXECVE msg=audit(1394281498.566:63): argc=1 a0="/tmp/printable-comm"
> type=CWD msg=audit(1394281498.566:63):  cwd="/root"
> type=PATH msg=audit(1394281498.566:63): item=0 name="/tmp/printable-comm" inode=1970955 dev=08:01 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL
> type=PATH msg=audit(1394281498.566:63): item=1 name=(null) inode=2360187 dev=08:01 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL
> ---------- An audit log with race ----------
> 
> Even if you don't trust the comm= field, it is annoying for me that fields
> after comm= are missing in the audit log.

More than annoying, that isn't acceptable.

- RGB

--
Richard Guy Briggs <rbriggs@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-03-10 20:21                         ` Richard Guy Briggs
@ 2014-03-11 12:02                           ` Tetsuo Handa
  2014-03-11 12:16                             ` Tetsuo Handa
  0 siblings, 1 reply; 20+ messages in thread
From: Tetsuo Handa @ 2014-03-11 12:02 UTC (permalink / raw)
  To: rgb
  Cc: peterz, paulmck, laijs, akpm, joe, keescook, geert, jkosina,
	viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

Richard Guy Briggs wrote:
> > Even if you don't trust the comm= field, it is annoying for me that fields
> > after comm= are missing in the audit log.
> 
> More than annoying, that isn't acceptable.
> 
OK. If you are sure that it is safe to use get_task_comm() from
audit_log_task() and you prefer locked version, please pick up below patch
via your git tree.

If you are unsure or prefer lockless version, I'll make a lockless version
using do_get_task_comm() proposed in this thread.

----------
>From 88c3ff13efa10df6f4d4d0f2c243124ce6a3eaeb Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 11 Mar 2014 20:47:29 +0900
Subject: [PATCH] Audit: Pass comm name via get_task_comm()

When we pass task->comm to audit_log_untrustedstring(), we need to pass
a snapshot of it using get_task_comm(). Otherwise, we will lose fields
after comm= if we raced with updating task->comm.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 kernel/auditsc.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7aef2f4..ba57993 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2357,6 +2357,7 @@ static void audit_log_task(struct audit_buffer *ab)
 	kgid_t gid;
 	unsigned int sessionid;
 	struct mm_struct *mm = current->mm;
+	char name[TASK_COMM_LEN];
 
 	auid = audit_get_loginuid(current);
 	sessionid = audit_get_sessionid(current);
@@ -2369,7 +2370,7 @@ static void audit_log_task(struct audit_buffer *ab)
 			 sessionid);
 	audit_log_task_context(ab);
 	audit_log_format(ab, " pid=%d comm=", current->pid);
-	audit_log_untrustedstring(ab, current->comm);
+	audit_log_untrustedstring(ab, get_task_comm(name, current));
 	if (mm) {
 		down_read(&mm->mmap_sem);
 		if (mm->exe_file)
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-03-11 12:02                           ` Tetsuo Handa
@ 2014-03-11 12:16                             ` Tetsuo Handa
  2014-03-11 13:55                               ` James Morris
  0 siblings, 1 reply; 20+ messages in thread
From: Tetsuo Handa @ 2014-03-11 12:16 UTC (permalink / raw)
  To: jmorris
  Cc: rgb, peterz, paulmck, laijs, akpm, joe, keescook, geert, jkosina,
	viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

And the same phrase goes to James Morris...

If you are sure that it is safe to use get_task_comm() from
dump_common_audit_data() and you prefer locked version, please pick up below
patch via your git tree.

If you are unsure or prefer lockless version, I'll make a lockless version
using do_get_task_comm() proposed in this thread.

----------
>From 1fe3dcd8792bc1296e337daba661200a8feaf706 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 11 Mar 2014 21:09:48 +0900
Subject: [PATCH] LSM: Pass comm name via get_task_comm()

When we pass task->comm to audit_log_untrustedstring(), we need to pass
a snapshot of it using get_task_comm(). Otherwise, we will lose fields
after comm= if we raced with updating task->comm.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 security/lsm_audit.c |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 9a62045..6291c67 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -212,6 +212,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 				   struct common_audit_data *a)
 {
 	struct task_struct *tsk = current;
+	char name[TASK_COMM_LEN];
 
 	/*
 	 * To keep stack sizes in check force programers to notice if they
@@ -221,7 +222,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 	BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2);
 
 	audit_log_format(ab, " pid=%d comm=", tsk->pid);
-	audit_log_untrustedstring(ab, tsk->comm);
+	audit_log_untrustedstring(ab, get_task_comm(name, tsk));
 
 	switch (a->type) {
 	case LSM_AUDIT_DATA_NONE:
@@ -280,7 +281,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 		tsk = a->u.tsk;
 		if (tsk && tsk->pid) {
 			audit_log_format(ab, " pid=%d comm=", tsk->pid);
-			audit_log_untrustedstring(ab, tsk->comm);
+			audit_log_untrustedstring(ab, get_task_comm(name, tsk));
 		}
 		break;
 	case LSM_AUDIT_DATA_NET:
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-03-11 12:16                             ` Tetsuo Handa
@ 2014-03-11 13:55                               ` James Morris
  2014-03-24 15:43                                 ` Richard Guy Briggs
  2014-03-27 17:20                                 ` [PATCH] LSM: Pass comm name via get_task_comm() [was: Re: [PATCH] Change task_struct->comm to use RCU.] Richard Guy Briggs
  0 siblings, 2 replies; 20+ messages in thread
From: James Morris @ 2014-03-11 13:55 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: rgb, peterz, paulmck, laijs, akpm, joe, keescook, geert, jkosina,
	viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

On Tue, 11 Mar 2014, Tetsuo Handa wrote:

> And the same phrase goes to James Morris...
> 
> If you are sure that it is safe to use get_task_comm() from
> dump_common_audit_data() and you prefer locked version, please pick up below
> patch via your git tree.
> 
> If you are unsure or prefer lockless version, I'll make a lockless version
> using do_get_task_comm() proposed in this thread.

If you can't understand whether your patch is correct or not, don't ask me 
to apply it to my tree.

If you're unsure, get it reviewed first.



- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] Change task_struct->comm to use RCU.
  2014-03-11 13:55                               ` James Morris
@ 2014-03-24 15:43                                 ` Richard Guy Briggs
  2014-03-27 17:20                                 ` [PATCH] LSM: Pass comm name via get_task_comm() [was: Re: [PATCH] Change task_struct->comm to use RCU.] Richard Guy Briggs
  1 sibling, 0 replies; 20+ messages in thread
From: Richard Guy Briggs @ 2014-03-24 15:43 UTC (permalink / raw)
  To: James Morris
  Cc: Tetsuo Handa, peterz, paulmck, laijs, akpm, joe, keescook, geert,
	jkosina, viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

On 14/03/12, James Morris wrote:
> On Tue, 11 Mar 2014, Tetsuo Handa wrote:
> > And the same phrase goes to James Morris...
> > 
> > If you are sure that it is safe to use get_task_comm() from
> > dump_common_audit_data() and you prefer locked version, please pick up below
> > patch via your git tree.
> > 
> > If you are unsure or prefer lockless version, I'll make a lockless version
> > using do_get_task_comm() proposed in this thread.
> 
> If you can't understand whether your patch is correct or not, don't ask me 
> to apply it to my tree.
> 
> If you're unsure, get it reviewed first.

I looked at the calling function tree back a number of steps and didn't
see any other evidence of task locks held, so I'm comfortable with this
patch and would prefer it to either memcpy() or do_get_task_comm(), but
would still like to get an opinion from others.

> - James

- RGB

--
Richard Guy Briggs <rbriggs@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH] LSM: Pass comm name via get_task_comm() [was: Re: [PATCH] Change task_struct->comm to use RCU.]
  2014-03-11 13:55                               ` James Morris
  2014-03-24 15:43                                 ` Richard Guy Briggs
@ 2014-03-27 17:20                                 ` Richard Guy Briggs
  2014-03-27 18:06                                   ` Stephen Smalley
  1 sibling, 1 reply; 20+ messages in thread
From: Richard Guy Briggs @ 2014-03-27 17:20 UTC (permalink / raw)
  To: James Morris, Steve Grubb, Eric Paris
  Cc: Tetsuo Handa, peterz, paulmck, laijs, akpm, joe, keescook, geert,
	jkosina, viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

On 14/03/12, James Morris wrote:
> On Tue, 11 Mar 2014, Tetsuo Handa wrote:
> 
> > And the same phrase goes to James Morris...
> > 
> > If you are sure that it is safe to use get_task_comm() from
> > dump_common_audit_data() and you prefer locked version, please pick up below
> > patch via your git tree.
> > 
> > If you are unsure or prefer lockless version, I'll make a lockless version
> > using do_get_task_comm() proposed in this thread.
> 
> If you can't understand whether your patch is correct or not, don't ask me 
> to apply it to my tree.
> 
> If you're unsure, get it reviewed first.

Steve (see https://lkml.org/lkml/2014/3/11/218 ) and James,

Are the labels on data output in LSM_AUDIT_DATA_TASK even right?  The
general case gives pid and comm of current.  Then the
LSM_AUDIT_DATA_TASK case gives pid and comm from the task handed in in
the struct common_audit_data pointer.  They are a duplicate of the
general case without generating a new message.  I expect this will cause
ausearch to ignore those latter two fields.  Should the latter two be
renamed to something like ad_pid= and ad_comm= ?

Tetsuo, this conversation should have been on the linux-audit@redhat.com
list the whole time...

> - James

- RGB

--
Richard Guy Briggs <rbriggs@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] LSM: Pass comm name via get_task_comm() [was: Re: [PATCH] Change task_struct->comm to use RCU.]
  2014-03-27 17:20                                 ` [PATCH] LSM: Pass comm name via get_task_comm() [was: Re: [PATCH] Change task_struct->comm to use RCU.] Richard Guy Briggs
@ 2014-03-27 18:06                                   ` Stephen Smalley
  2014-09-19  3:30                                     ` Richard Guy Briggs
  0 siblings, 1 reply; 20+ messages in thread
From: Stephen Smalley @ 2014-03-27 18:06 UTC (permalink / raw)
  To: Richard Guy Briggs, James Morris, Steve Grubb, Eric Paris
  Cc: Tetsuo Handa, peterz, paulmck, laijs, akpm, joe, keescook, geert,
	jkosina, viro, davem, linux-kernel, mingo, rostedt, tglx,
	linux-security-module

On 03/27/2014 01:20 PM, Richard Guy Briggs wrote:
> On 14/03/12, James Morris wrote:
>> On Tue, 11 Mar 2014, Tetsuo Handa wrote:
>>
>>> And the same phrase goes to James Morris...
>>>
>>> If you are sure that it is safe to use get_task_comm() from
>>> dump_common_audit_data() and you prefer locked version, please pick up below
>>> patch via your git tree.
>>>
>>> If you are unsure or prefer lockless version, I'll make a lockless version
>>> using do_get_task_comm() proposed in this thread.
>>
>> If you can't understand whether your patch is correct or not, don't ask me 
>> to apply it to my tree.
>>
>> If you're unsure, get it reviewed first.
> 
> Steve (see https://lkml.org/lkml/2014/3/11/218 ) and James,
> 
> Are the labels on data output in LSM_AUDIT_DATA_TASK even right?  The
> general case gives pid and comm of current.  Then the
> LSM_AUDIT_DATA_TASK case gives pid and comm from the task handed in in
> the struct common_audit_data pointer.  They are a duplicate of the
> general case without generating a new message.  I expect this will cause
> ausearch to ignore those latter two fields.  Should the latter two be
> renamed to something like ad_pid= and ad_comm= ?

Hmmm..only seems to be used by Smack.
SELinux had a tsk field in common_audit_data that was removed by
b466066.  This other tsk field seems to have been added for Smack by
6e837fb.

That said, it would be nice to have pid/comm info for the target of a
signal check as well as current.



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] LSM: Pass comm name via get_task_comm() [was: Re: [PATCH] Change task_struct->comm to use RCU.]
  2014-03-27 18:06                                   ` Stephen Smalley
@ 2014-09-19  3:30                                     ` Richard Guy Briggs
  0 siblings, 0 replies; 20+ messages in thread
From: Richard Guy Briggs @ 2014-09-19  3:30 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: James Morris, Steve Grubb, Eric Paris, Tetsuo Handa, peterz,
	paulmck, laijs, akpm, joe, keescook, geert, jkosina, viro, davem,
	linux-kernel, mingo, rostedt, tglx, linux-security-module

On 14/03/27, Stephen Smalley wrote:
> On 03/27/2014 01:20 PM, Richard Guy Briggs wrote:
> > On 14/03/12, James Morris wrote:
> >> On Tue, 11 Mar 2014, Tetsuo Handa wrote:
> >>
> >>> And the same phrase goes to James Morris...
> >>>
> >>> If you are sure that it is safe to use get_task_comm() from
> >>> dump_common_audit_data() and you prefer locked version, please pick up below
> >>> patch via your git tree.
> >>>
> >>> If you are unsure or prefer lockless version, I'll make a lockless version
> >>> using do_get_task_comm() proposed in this thread.
> >>
> >> If you can't understand whether your patch is correct or not, don't ask me 
> >> to apply it to my tree.
> >>
> >> If you're unsure, get it reviewed first.
> > 
> > Steve (see https://lkml.org/lkml/2014/3/11/218 ) and James,
> > 
> > Are the labels on data output in LSM_AUDIT_DATA_TASK even right?  The
> > general case gives pid and comm of current.  Then the
> > LSM_AUDIT_DATA_TASK case gives pid and comm from the task handed in in
> > the struct common_audit_data pointer.  They are a duplicate of the
> > general case without generating a new message.  I expect this will cause
> > ausearch to ignore those latter two fields.  Should the latter two be
> > renamed to something like ad_pid= and ad_comm= ?
> 
> Hmmm..only seems to be used by Smack.
> SELinux had a tsk field in common_audit_data that was removed by
> b466066.  This other tsk field seems to have been added for Smack by
> 6e837fb.
> 
> That said, it would be nice to have pid/comm info for the target of a
> signal check as well as current.

Reviving a bit of an old thread...

Probably the appropriate keywords would be opid= and ocomm= for the
target (object).

- RGB

--
Richard Guy Briggs <rbriggs@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2014-09-19  3:31 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <201402072303.DJD13007.JFFMSLHOOFQOtV@I-love.SAKURA.ne.jp>
     [not found] ` <20140207140536.943daf965008b9428cdcb468@linux-foundation.org>
     [not found]   ` <201402081055.BGJ73403.tQMLFVOJSOOFFH@I-love.SAKURA.ne.jp>
     [not found]     ` <20140207180647.5944fe3d.akpm@linux-foundation.org>
     [not found]       ` <201402092327.JAD12489.QOLSFVMHJtFOOF@I-love.SAKURA.ne.jp>
2014-02-10 13:43         ` [PATCH (draft)] Change task_struct->comm to use RCU Tetsuo Handa
2014-02-17 11:27           ` [PATCH] " Tetsuo Handa
2014-02-24 23:51             ` Paul E. McKenney
2014-02-26 13:44               ` Tetsuo Handa
2014-02-26 15:26                 ` Paul E. McKenney
2014-02-25  1:49             ` Lai Jiangshan
2014-02-25 10:05               ` Peter Zijlstra
2014-02-25 12:54               ` Tetsuo Handa
2014-02-25 14:46                 ` Peter Zijlstra
2014-03-07 12:20                   ` Tetsuo Handa
2014-03-07 15:54                     ` Richard Guy Briggs
2014-03-08 12:43                       ` Tetsuo Handa
2014-03-10 20:21                         ` Richard Guy Briggs
2014-03-11 12:02                           ` Tetsuo Handa
2014-03-11 12:16                             ` Tetsuo Handa
2014-03-11 13:55                               ` James Morris
2014-03-24 15:43                                 ` Richard Guy Briggs
2014-03-27 17:20                                 ` [PATCH] LSM: Pass comm name via get_task_comm() [was: Re: [PATCH] Change task_struct->comm to use RCU.] Richard Guy Briggs
2014-03-27 18:06                                   ` Stephen Smalley
2014-09-19  3:30                                     ` Richard Guy Briggs

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.