* [PATCH 1/4] task_work: use the single-linked list to shrink sizeof(task_work)
2012-06-27 18:37 ` [PATCH 0/4] Was: " Oleg Nesterov
@ 2012-06-27 18:37 ` Oleg Nesterov
2012-06-27 18:37 ` [PATCH 2/4] task_work: don't rely on PF_EXITING Oleg Nesterov
` (3 subsequent siblings)
4 siblings, 0 replies; 54+ messages in thread
From: Oleg Nesterov @ 2012-06-27 18:37 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
Change struct task_work to have a single forward pointer. The pending
task_work's create the circular list, and task->last_twork points to
the last element. This way we can access head/tail in O(1).
Suggested-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
include/linux/sched.h | 3 +-
include/linux/task_work.h | 5 +--
include/linux/tracehook.h | 4 +-
kernel/fork.c | 2 +-
kernel/task_work.c | 62 +++++++++++++++++++++++++-------------------
5 files changed, 42 insertions(+), 34 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4059c0f..5486299 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -101,6 +101,7 @@ struct bio_list;
struct fs_struct;
struct perf_event_context;
struct blk_plug;
+struct task_work;
/*
* List of flags we want to share for kernel threads,
@@ -1405,7 +1406,7 @@ struct task_struct {
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
- struct hlist_head task_works;
+ struct task_work *last_twork;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 294d5d5..03640fb 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -1,14 +1,13 @@
#ifndef _LINUX_TASK_WORK_H
#define _LINUX_TASK_WORK_H
-#include <linux/list.h>
#include <linux/sched.h>
struct task_work;
typedef void (*task_work_func_t)(struct task_work *);
struct task_work {
- struct hlist_node hlist;
+ struct task_work *next;
task_work_func_t func;
void *data;
};
@@ -26,7 +25,7 @@ void task_work_run(void);
static inline void exit_task_work(struct task_struct *task)
{
- if (unlikely(!hlist_empty(&task->task_works)))
+ if (unlikely(task->last_twork))
task_work_run();
}
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 6a4d82b..89e88fe 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -189,10 +189,10 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
/*
* The caller just cleared TIF_NOTIFY_RESUME. This barrier
* pairs with task_work_add()->set_notify_resume() after
- * hlist_add_head(task->task_works);
+ * addition to task->last_twork list.
*/
smp_mb__after_clear_bit();
- if (unlikely(!hlist_empty(¤t->task_works)))
+ if (unlikely(current->last_twork))
task_work_run();
}
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b..075000b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1415,7 +1415,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);
- INIT_HLIST_HEAD(&p->task_works);
+ p->last_twork = NULL;
/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 82d1c79..a0a3133 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -2,10 +2,15 @@
#include <linux/task_work.h>
#include <linux/tracehook.h>
+/*
+ * task->last_twork points to the last node in the circular single-linked list.
+ */
+
int
task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
{
unsigned long flags;
+ struct task_work *last;
int err = -ESRCH;
#ifndef TIF_NOTIFY_RESUME
@@ -19,7 +24,10 @@ task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
if (likely(!(task->flags & PF_EXITING))) {
- hlist_add_head(&twork->hlist, &task->task_works);
+ last = task->last_twork ?: twork;
+ task->last_twork = twork;
+ twork->next = last->next;
+ last->next = twork;
err = 0;
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
@@ -34,15 +42,26 @@ struct task_work *
task_work_cancel(struct task_struct *task, task_work_func_t func)
{
unsigned long flags;
- struct task_work *twork;
- struct hlist_node *pos;
+ struct task_work *last, *prev, *twork;
raw_spin_lock_irqsave(&task->pi_lock, flags);
- hlist_for_each_entry(twork, pos, &task->task_works, hlist) {
- if (twork->func == func) {
- hlist_del(&twork->hlist);
+ last = task->last_twork;
+ if (last) {
+ twork = last;
+ do {
+ prev = twork;
+ twork = twork->next;
+ if (twork->func != func)
+ continue;
+
+ prev->next = twork->next;
+ if (twork == last) {
+ if (prev == twork)
+ prev = NULL;
+ task->last_twork = prev;
+ }
goto found;
- }
+ } while (twork != last);
}
twork = NULL;
found:
@@ -54,31 +73,20 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
void task_work_run(void)
{
struct task_struct *task = current;
- struct hlist_head task_works;
- struct hlist_node *pos;
+ struct task_work *last, *next, *twork;
raw_spin_lock_irq(&task->pi_lock);
- hlist_move_list(&task->task_works, &task_works);
+ last = task->last_twork;
+ task->last_twork = NULL;
raw_spin_unlock_irq(&task->pi_lock);
- if (unlikely(hlist_empty(&task_works)))
+ if (unlikely(!last))
return;
- /*
- * We use hlist to save the space in task_struct, but we want fifo.
- * Find the last entry, the list should be short, then process them
- * in reverse order.
- */
- for (pos = task_works.first; pos->next; pos = pos->next)
- ;
- for (;;) {
- struct hlist_node **pprev = pos->pprev;
- struct task_work *twork = container_of(pos, struct task_work,
- hlist);
+ next = last->next;
+ do {
+ twork = next;
+ next = twork->next;
twork->func(twork);
-
- if (pprev == &task_works.first)
- break;
- pos = container_of(pprev, struct hlist_node, next);
- }
+ } while (twork != last);
}
--
1.5.5.1
^ permalink raw reply related [flat|nested] 54+ messages in thread
* [PATCH 2/4] task_work: don't rely on PF_EXITING
2012-06-27 18:37 ` [PATCH 0/4] Was: " Oleg Nesterov
2012-06-27 18:37 ` [PATCH 1/4] task_work: use the single-linked list to shrink sizeof(task_work) Oleg Nesterov
@ 2012-06-27 18:37 ` Oleg Nesterov
2012-06-27 18:38 ` [PATCH 3/4] task_work: deal with task_work callbacks adding more work Oleg Nesterov
` (2 subsequent siblings)
4 siblings, 0 replies; 54+ messages in thread
From: Oleg Nesterov @ 2012-06-27 18:37 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
task_work_add() checks PF_EXITING to ensure it can't insert the new
twork after exit_task_work(). This means that the exiting task can
not use it after exit_signals().
Change task_work_add() and task_work_run() to use the fake
TWORK_EXITED pointer to synchronize with each other, now we can
move exit_task_work() down and make task_work_add() more useful.
Unfortunately, this means that the exiting task needs to take
pi_lock unconditionally.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
include/linux/task_work.h | 6 ++----
kernel/task_work.c | 15 ++++++++-------
2 files changed, 10 insertions(+), 11 deletions(-)
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 03640fb..ef70b01 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -1,9 +1,8 @@
#ifndef _LINUX_TASK_WORK_H
#define _LINUX_TASK_WORK_H
-#include <linux/sched.h>
-
struct task_work;
+struct task_struct;
typedef void (*task_work_func_t)(struct task_work *);
struct task_work {
@@ -25,8 +24,7 @@ void task_work_run(void);
static inline void exit_task_work(struct task_struct *task)
{
- if (unlikely(task->last_twork))
- task_work_run();
+ task_work_run();
}
#endif /* _LINUX_TASK_WORK_H */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index a0a3133..b2ff3b7 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -6,6 +6,8 @@
* task->last_twork points to the last node in the circular single-linked list.
*/
+#define TWORK_EXITED ((struct task_work *)1)
+
int
task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
{
@@ -18,12 +20,11 @@ task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
return -ENOTSUPP;
#endif
/*
- * We must not insert the new work if the task has already passed
- * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait()
- * and check PF_EXITING under pi_lock.
+ * We must not insert the new work if the exiting task has already
+ * passed task_work_run().
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- if (likely(!(task->flags & PF_EXITING))) {
+ if (likely(task->last_twork != TWORK_EXITED)) {
last = task->last_twork ?: twork;
task->last_twork = twork;
twork->next = last->next;
@@ -46,7 +47,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
raw_spin_lock_irqsave(&task->pi_lock, flags);
last = task->last_twork;
- if (last) {
+ if (last && last != TWORK_EXITED) {
twork = last;
do {
prev = twork;
@@ -77,10 +78,10 @@ void task_work_run(void)
raw_spin_lock_irq(&task->pi_lock);
last = task->last_twork;
- task->last_twork = NULL;
+ task->last_twork = (task->flags & PF_EXITING) ? TWORK_EXITED : NULL;
raw_spin_unlock_irq(&task->pi_lock);
- if (unlikely(!last))
+ if (!last)
return;
next = last->next;
--
1.5.5.1
^ permalink raw reply related [flat|nested] 54+ messages in thread
* [PATCH 3/4] task_work: deal with task_work callbacks adding more work
2012-06-27 18:37 ` [PATCH 0/4] Was: " Oleg Nesterov
2012-06-27 18:37 ` [PATCH 1/4] task_work: use the single-linked list to shrink sizeof(task_work) Oleg Nesterov
2012-06-27 18:37 ` [PATCH 2/4] task_work: don't rely on PF_EXITING Oleg Nesterov
@ 2012-06-27 18:38 ` Oleg Nesterov
2012-06-27 18:38 ` [PATCH 4/4] task_work: kill task_work->data Oleg Nesterov
2012-06-28 4:38 ` [PATCH 0/4] Was: deferring __fput() Al Viro
4 siblings, 0 replies; 54+ messages in thread
From: Oleg Nesterov @ 2012-06-27 18:38 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
Shamelessly stolen Al Viro's patch and the changelog, just I didn't
dare to add the proper From tag.
It doesn't matter on normal return to userland path (we'll recheck
the NOTIFY_RESUME flag anyway), but in case of exit_task_work() we'll
need that as soon as we get callbacks capable of triggering more
task_work_add().
Suggested-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/task_work.c | 33 +++++++++++++++++++++------------
1 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/kernel/task_work.c b/kernel/task_work.c
index b2ff3b7..8dfc48c 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -76,18 +76,27 @@ void task_work_run(void)
struct task_struct *task = current;
struct task_work *last, *next, *twork;
- raw_spin_lock_irq(&task->pi_lock);
- last = task->last_twork;
- task->last_twork = (task->flags & PF_EXITING) ? TWORK_EXITED : NULL;
- raw_spin_unlock_irq(&task->pi_lock);
+ for (;;) {
+ /*
+ * twork->func() can do task_work_add(), do not
+ * set TWORK_EXITED until the list becomes empty.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ last = task->last_twork;
+ if (last)
+ task->last_twork = NULL;
+ else if (task->flags & PF_EXITING)
+ task->last_twork = TWORK_EXITED;
+ raw_spin_unlock_irq(&task->pi_lock);
- if (!last)
- return;
+ if (!last)
+ return;
- next = last->next;
- do {
- twork = next;
- next = twork->next;
- twork->func(twork);
- } while (twork != last);
+ next = last->next;
+ do {
+ twork = next;
+ next = twork->next;
+ twork->func(twork);
+ } while (twork != last);
+ }
}
--
1.5.5.1
^ permalink raw reply related [flat|nested] 54+ messages in thread
* [PATCH 4/4] task_work: kill task_work->data
2012-06-27 18:37 ` [PATCH 0/4] Was: " Oleg Nesterov
` (2 preceding siblings ...)
2012-06-27 18:38 ` [PATCH 3/4] task_work: deal with task_work callbacks adding more work Oleg Nesterov
@ 2012-06-27 18:38 ` Oleg Nesterov
2012-06-27 19:05 ` Oleg Nesterov
2012-06-28 4:38 ` [PATCH 0/4] Was: deferring __fput() Al Viro
4 siblings, 1 reply; 54+ messages in thread
From: Oleg Nesterov @ 2012-06-27 18:38 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
Kill task_work->data, this makes sizeof(task_work) == sizeof(rcu_head).
Turn cred->rcu into rcu_head/task_work union for keyctl_session_to_parent().
Suggested-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
include/linux/cred.h | 6 +++++-
include/linux/task_work.h | 4 +---
kernel/irq/manage.c | 2 +-
security/keys/keyctl.c | 31 +++++++++++--------------------
security/keys/process_keys.c | 3 +--
5 files changed, 19 insertions(+), 27 deletions(-)
diff --git a/include/linux/cred.h b/include/linux/cred.h
index ebbed2c..a24c353 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -18,6 +18,7 @@
#include <linux/selinux.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
+#include <linux/task_work.h>
struct user_struct;
struct cred;
@@ -149,7 +150,10 @@ struct cred {
struct user_struct *user; /* real user ID subscription */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
struct group_info *group_info; /* supplementary groups for euid/fsgid */
- struct rcu_head rcu; /* RCU deletion hook */
+ union {
+ struct rcu_head rcu; /* RCU deletion hook */
+ struct task_work twork; /* keyctl_session_to_parent() */
+ };
};
extern void __put_cred(struct cred *);
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index ef70b01..09e470d 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -8,14 +8,12 @@ typedef void (*task_work_func_t)(struct task_work *);
struct task_work {
struct task_work *next;
task_work_func_t func;
- void *data;
};
static inline void
-init_task_work(struct task_work *twork, task_work_func_t func, void *data)
+init_task_work(struct task_work *twork, task_work_func_t func)
{
twork->func = func;
- twork->data = data;
}
int task_work_add(struct task_struct *task, struct task_work *twork, bool);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8c54823..d1dd547 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -830,7 +830,7 @@ static int irq_thread(void *data)
sched_setscheduler(current, SCHED_FIFO, ¶m);
- init_task_work(&on_exit_work, irq_thread_dtor, NULL);
+ init_task_work(&on_exit_work, irq_thread_dtor);
task_work_add(current, &on_exit_work, false);
while (!irq_wait_for_interrupt(action)) {
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 0f5b3f0..f221cab 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1456,8 +1456,8 @@ long keyctl_session_to_parent(void)
{
struct task_struct *me, *parent;
const struct cred *mycred, *pcred;
- struct task_work *newwork, *oldwork;
key_ref_t keyring_r;
+ struct task_work *oldwork;
struct cred *cred;
int ret;
@@ -1465,20 +1465,16 @@ long keyctl_session_to_parent(void)
if (IS_ERR(keyring_r))
return PTR_ERR(keyring_r);
- ret = -ENOMEM;
- newwork = kmalloc(sizeof(struct task_work), GFP_KERNEL);
- if (!newwork)
- goto error_keyring;
-
/* our parent is going to need a new cred struct, a new tgcred struct
* and new security data, so we allocate them here to prevent ENOMEM in
* our parent */
+ ret = -ENOMEM;
cred = cred_alloc_blank();
if (!cred)
- goto error_newwork;
+ goto error_keyring;
cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r);
- init_task_work(newwork, key_change_session_keyring, cred);
+ init_task_work(&cred->twork, key_change_session_keyring);
me = current;
rcu_read_lock();
@@ -1527,24 +1523,19 @@ long keyctl_session_to_parent(void)
/* the replacement session keyring is applied just prior to userspace
* restarting */
- ret = task_work_add(parent, newwork, true);
+ ret = task_work_add(parent, &cred->twork, true);
if (!ret)
- newwork = NULL;
+ cred = NULL;
unlock:
write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
- if (oldwork) {
- put_cred(oldwork->data);
- kfree(oldwork);
- }
- if (newwork) {
- put_cred(newwork->data);
- kfree(newwork);
- }
+
+ if (oldwork)
+ put_cred(container_of(oldwork, struct cred, twork));
+ if (cred)
+ put_cred(cred);
return ret;
-error_newwork:
- kfree(newwork);
error_keyring:
key_ref_put(keyring_r);
return ret;
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 4ad54ee..01303c2 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -837,9 +837,8 @@ error:
void key_change_session_keyring(struct task_work *twork)
{
const struct cred *old = current_cred();
- struct cred *new = twork->data;
+ struct cred *new = container_of(twork, struct cred, twork);
- kfree(twork);
if (unlikely(current->flags & PF_EXITING)) {
put_cred(new);
return;
--
1.5.5.1
^ permalink raw reply related [flat|nested] 54+ messages in thread
* Re: [PATCH 4/4] task_work: kill task_work->data
2012-06-27 18:38 ` [PATCH 4/4] task_work: kill task_work->data Oleg Nesterov
@ 2012-06-27 19:05 ` Oleg Nesterov
0 siblings, 0 replies; 54+ messages in thread
From: Oleg Nesterov @ 2012-06-27 19:05 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On 06/27, Oleg Nesterov wrote:
>
> Turn cred->rcu into rcu_head/task_work union for keyctl_session_to_parent().
I tried to make the minimal patch, but perhaps with this change we
can also simplify the error handling in keyctl_session_to_parent()
a little bit.
We can do cred_alloc_blank() before lookup_user_key() and remove
error_keyring/key_ref_put.
IOW, on top of this patch:
--- x/security/keys/keyctl.c
+++ x/security/keys/keyctl.c
@@ -1461,17 +1461,18 @@ long keyctl_session_to_parent(void)
struct cred *cred;
int ret;
- keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_LINK);
- if (IS_ERR(keyring_r))
- return PTR_ERR(keyring_r);
-
/* our parent is going to need a new cred struct, a new tgcred struct
* and new security data, so we allocate them here to prevent ENOMEM in
* our parent */
- ret = -ENOMEM;
cred = cred_alloc_blank();
if (!cred)
- goto error_keyring;
+ return -ENOMEM;
+
+ keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_LINK);
+ if (IS_ERR(keyring_r)) {
+ ret = PTR_ERR(keyring_r);
+ goto free_cred;
+ }
cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r);
init_task_work(&cred->twork, key_change_session_keyring);
@@ -1532,13 +1533,10 @@ unlock:
if (oldwork)
put_cred(container_of(oldwork, struct cred, twork));
+free_cred:
if (cred)
put_cred(cred);
return ret;
-
-error_keyring:
- key_ref_put(keyring_r);
- return ret;
}
/*
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-27 18:37 ` [PATCH 0/4] Was: " Oleg Nesterov
` (3 preceding siblings ...)
2012-06-27 18:38 ` [PATCH 4/4] task_work: kill task_work->data Oleg Nesterov
@ 2012-06-28 4:38 ` Al Viro
2012-06-28 16:22 ` Oleg Nesterov
2012-06-29 5:30 ` Mimi Zohar
4 siblings, 2 replies; 54+ messages in thread
From: Al Viro @ 2012-06-28 4:38 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Wed, Jun 27, 2012 at 08:37:21PM +0200, Oleg Nesterov wrote:
> On 06/25, Oleg Nesterov wrote:
> >
> > And if it always takes ->pi_lock we do not need the new PF_ or something
> > else, exit_task_work() can set task->task_works = NO_MORE under ->pi_lock
> > (task_work_run() can check PF_EXITING), and task_work_add() ensures that
> > task_works != NO_MORE.
> >
> > What do you think?
>
> It is not clear to me if you agree or not. So I am simply sending the
> patches I have.
>
> Feel free to ignore or re-do.
>
> Seriously, why should we add 2 pointers into task_struct? Sure, this
> is minor, but still... But perhaps task_work.c should not play tricks
> with the circular list, task_work_run() can reverse the list as you
> initially suggested.
>
> Also, I am not sure about "define rcu_head callback_head", this series
> doesn't do this. But again, up to you.
Umm... FWIW, my variant circa yesterday is in vfs.git#untested; it seems to survive
on uml/amd64 at least. I'll look through your patches and see what can be nicked.
The list removal logics in mine looks really ugly ;-/
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-28 4:38 ` [PATCH 0/4] Was: deferring __fput() Al Viro
@ 2012-06-28 16:22 ` Oleg Nesterov
2012-06-28 16:45 ` Oleg Nesterov
2012-06-29 5:30 ` Mimi Zohar
1 sibling, 1 reply; 54+ messages in thread
From: Oleg Nesterov @ 2012-06-28 16:22 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On 06/28, Al Viro wrote:
>
> The list removal logics in mine looks really ugly ;-/
Basically the code is almost the same as mine.
But task_work_add() can be simplified a bit, no need to check
last != NULL twice.
last = task->task_works ?: twork;
task->last_twork = twork;
twork->next = last->next; /* XXX */
last->next = twork;
If ->task_works == NULL, then XXX is meaningless but safe.
Oleg.
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-28 16:22 ` Oleg Nesterov
@ 2012-06-28 16:45 ` Oleg Nesterov
2012-06-30 6:24 ` Al Viro
0 siblings, 1 reply; 54+ messages in thread
From: Oleg Nesterov @ 2012-06-28 16:45 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
Forgot to mention...
And I still think that task_work_add() should not succeed unconditionally,
it synchronize with exit_task_work(). Otherwise keyctl_session_to_parent()
is broken.
On 06/28, Oleg Nesterov wrote:
>
> On 06/28, Al Viro wrote:
> >
> > The list removal logics in mine looks really ugly ;-/
>
> Basically the code is almost the same as mine.
>
> But task_work_add() can be simplified a bit, no need to check
> last != NULL twice.
>
> last = task->task_works ?: twork;
> task->last_twork = twork;
> twork->next = last->next; /* XXX */
> last->next = twork;
>
> If ->task_works == NULL, then XXX is meaningless but safe.
>
> Oleg.
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-28 16:45 ` Oleg Nesterov
@ 2012-06-30 6:24 ` Al Viro
2012-06-30 17:41 ` Oleg Nesterov
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-06-30 6:24 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Thu, Jun 28, 2012 at 06:45:39PM +0200, Oleg Nesterov wrote:
> Forgot to mention...
>
> And I still think that task_work_add() should not succeed unconditionally,
> it synchronize with exit_task_work(). Otherwise keyctl_session_to_parent()
> is broken.
Hmm... Look: if nothing else, we have
/* the parent mustn't be init and mustn't be a kernel thread */
if (parent->pid <= 1 || !parent->mm)
goto unlock;
in the caller. OTOH, on the exit side we have exit_mm() done first. And
that will have ->mm set to NULL. So we are closing a very narrow race to start
with. So why not do the following and be done with that?
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 0291b3f..f1b59ae 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1486,6 +1486,7 @@ long keyctl_session_to_parent(void)
oldwork = NULL;
parent = me->real_parent;
+ task_lock(parent);
/* the parent mustn't be init and mustn't be a kernel thread */
if (parent->pid <= 1 || !parent->mm)
goto unlock;
@@ -1529,6 +1530,7 @@ long keyctl_session_to_parent(void)
if (!ret)
newwork = NULL;
unlock:
+ task_unlock(parent);
write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
if (oldwork)
^ permalink raw reply related [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-30 6:24 ` Al Viro
@ 2012-06-30 17:41 ` Oleg Nesterov
0 siblings, 0 replies; 54+ messages in thread
From: Oleg Nesterov @ 2012-06-30 17:41 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On 06/30, Al Viro wrote:
>
> On Thu, Jun 28, 2012 at 06:45:39PM +0200, Oleg Nesterov wrote:
> > Forgot to mention...
> >
> > And I still think that task_work_add() should not succeed unconditionally,
> > it synchronize with exit_task_work(). Otherwise keyctl_session_to_parent()
> > is broken.
>
> Hmm... Look: if nothing else, we have
> /* the parent mustn't be init and mustn't be a kernel thread */
> if (parent->pid <= 1 || !parent->mm)
> goto unlock;
> in the caller. OTOH, on the exit side we have exit_mm() done first. And
> that will have ->mm set to NULL. So we are closing a very narrow race to start
> with. So why not do the following and be done with that?
Of course we can fix keyctl_session_to_parent(). But why? I mean, why
do you dislike the idea to put this synchronization into add/run ?
IMO, this makes task_work much less useful/convenient. Every caller
has to fight with this race somehow. And the lock it can take depends
on the context, say, you can't use task_lock() in irq.
IOW, what is wrong with
[PATCH 2/4] task_work: don't rely on PF_EXITING
http://marc.info/?l=linux-kernel&m=134082265321691
and
[PATCH 3/4] task_work: deal with task_work callbacks adding more work
http://marc.info/?t=134082275400004
?
perhaps you do not like the fact that the exiting task takes pi_lock
unconditionally?
and in fact I think that probably it makes sense to change fput,
- task_work_add(current, ...);
+ BUG_ON(task_work_add(current, ...) != 0);
Oleg.
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-28 4:38 ` [PATCH 0/4] Was: deferring __fput() Al Viro
2012-06-28 16:22 ` Oleg Nesterov
@ 2012-06-29 5:30 ` Mimi Zohar
2012-06-29 8:33 ` Al Viro
1 sibling, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-06-29 5:30 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Thu, 2012-06-28 at 05:38 +0100, Al Viro wrote:
> On Wed, Jun 27, 2012 at 08:37:21PM +0200, Oleg Nesterov wrote:
> > On 06/25, Oleg Nesterov wrote:
> > >
> > > And if it always takes ->pi_lock we do not need the new PF_ or something
> > > else, exit_task_work() can set task->task_works = NO_MORE under ->pi_lock
> > > (task_work_run() can check PF_EXITING), and task_work_add() ensures that
> > > task_works != NO_MORE.
> > >
> > > What do you think?
> >
> > It is not clear to me if you agree or not. So I am simply sending the
> > patches I have.
> >
> > Feel free to ignore or re-do.
> >
> > Seriously, why should we add 2 pointers into task_struct? Sure, this
> > is minor, but still... But perhaps task_work.c should not play tricks
> > with the circular list, task_work_run() can reverse the list as you
> > initially suggested.
> >
> > Also, I am not sure about "define rcu_head callback_head", this series
> > doesn't do this. But again, up to you.
>
> Umm... FWIW, my variant circa yesterday is in vfs.git#untested; it seems to survive
> on uml/amd64 at least. I'll look through your patches and see what can be nicked.
> The list removal logics in mine looks really ugly ;-/
Still failing to boot. Fails to boot starting with commit "b24dfa6
switch fput to task_work_add".
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-29 5:30 ` Mimi Zohar
@ 2012-06-29 8:33 ` Al Viro
2012-06-29 13:02 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-06-29 8:33 UTC (permalink / raw)
To: Mimi Zohar
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Fri, Jun 29, 2012 at 01:30:38AM -0400, Mimi Zohar wrote:
> On Thu, 2012-06-28 at 05:38 +0100, Al Viro wrote:
> > On Wed, Jun 27, 2012 at 08:37:21PM +0200, Oleg Nesterov wrote:
> > > On 06/25, Oleg Nesterov wrote:
> > > >
> > > > And if it always takes ->pi_lock we do not need the new PF_ or something
> > > > else, exit_task_work() can set task->task_works = NO_MORE under ->pi_lock
> > > > (task_work_run() can check PF_EXITING), and task_work_add() ensures that
> > > > task_works != NO_MORE.
> > > >
> > > > What do you think?
> > >
> > > It is not clear to me if you agree or not. So I am simply sending the
> > > patches I have.
> > >
> > > Feel free to ignore or re-do.
> > >
> > > Seriously, why should we add 2 pointers into task_struct? Sure, this
> > > is minor, but still... But perhaps task_work.c should not play tricks
> > > with the circular list, task_work_run() can reverse the list as you
> > > initially suggested.
> > >
> > > Also, I am not sure about "define rcu_head callback_head", this series
> > > doesn't do this. But again, up to you.
> >
> > Umm... FWIW, my variant circa yesterday is in vfs.git#untested; it seems to survive
> > on uml/amd64 at least. I'll look through your patches and see what can be nicked.
> > The list removal logics in mine looks really ugly ;-/
>
> Still failing to boot. Fails to boot starting with commit "b24dfa6
> switch fput to task_work_add".
Details, please... .config, root fs type, etc. Failed execve() of init is, unfortunately,
not too informative...
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-29 8:33 ` Al Viro
@ 2012-06-29 13:02 ` Mimi Zohar
2012-06-29 17:41 ` Al Viro
0 siblings, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-06-29 13:02 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Fri, 2012-06-29 at 09:33 +0100, Al Viro wrote:
> On Fri, Jun 29, 2012 at 01:30:38AM -0400, Mimi Zohar wrote:
> > On Thu, 2012-06-28 at 05:38 +0100, Al Viro wrote:
> > > On Wed, Jun 27, 2012 at 08:37:21PM +0200, Oleg Nesterov wrote:
> > > > On 06/25, Oleg Nesterov wrote:
> > > > >
> > > > > And if it always takes ->pi_lock we do not need the new PF_ or something
> > > > > else, exit_task_work() can set task->task_works = NO_MORE under ->pi_lock
> > > > > (task_work_run() can check PF_EXITING), and task_work_add() ensures that
> > > > > task_works != NO_MORE.
> > > > >
> > > > > What do you think?
> > > >
> > > > It is not clear to me if you agree or not. So I am simply sending the
> > > > patches I have.
> > > >
> > > > Feel free to ignore or re-do.
> > > >
> > > > Seriously, why should we add 2 pointers into task_struct? Sure, this
> > > > is minor, but still... But perhaps task_work.c should not play tricks
> > > > with the circular list, task_work_run() can reverse the list as you
> > > > initially suggested.
> > > >
> > > > Also, I am not sure about "define rcu_head callback_head", this series
> > > > doesn't do this. But again, up to you.
> > >
> > > Umm... FWIW, my variant circa yesterday is in vfs.git#untested; it seems to survive
> > > on uml/amd64 at least. I'll look through your patches and see what can be nicked.
> > > The list removal logics in mine looks really ugly ;-/
> >
> > Still failing to boot. Fails to boot starting with commit "b24dfa6
> > switch fput to task_work_add".
>
> Details, please... .config, root fs type, etc. Failed execve() of init is, unfortunately,
> not too informative...
In addition to failing with my tailored .config (eg. IMA, TPM builtin),
it fails with config-3.4.2-1.fc16.x86_64, with new default options,
running on a Lenovo W520.
/dev/sda7 on / type ext4 (rw,relatime,seclabel,i_version,data=ordered)
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-29 13:02 ` Mimi Zohar
@ 2012-06-29 17:41 ` Al Viro
2012-06-29 21:38 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-06-29 17:41 UTC (permalink / raw)
To: Mimi Zohar
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Fri, Jun 29, 2012 at 09:02:05AM -0400, Mimi Zohar wrote:
> In addition to failing with my tailored .config (eg. IMA, TPM builtin),
> it fails with config-3.4.2-1.fc16.x86_64, with new default options,
> running on a Lenovo W520.
>
> /dev/sda7 on / type ext4 (rw,relatime,seclabel,i_version,data=ordered)
Lovely... Smells like something making dracut unhappy. Does it say
anything interesting when booting with rdinitdebug in command line?
What happens with rdshell in there?
I don't have easily accessible Fedora boxen at the moment and it does
look like something fishy going on while initramfs stuff runs...
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-29 17:41 ` Al Viro
@ 2012-06-29 21:38 ` Mimi Zohar
2012-06-29 23:56 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-06-29 21:38 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Fri, 2012-06-29 at 18:41 +0100, Al Viro wrote:
> On Fri, Jun 29, 2012 at 09:02:05AM -0400, Mimi Zohar wrote:
>
> > In addition to failing with my tailored .config (eg. IMA, TPM builtin),
> > it fails with config-3.4.2-1.fc16.x86_64, with new default options,
> > running on a Lenovo W520.
> >
> > /dev/sda7 on / type ext4 (rw,relatime,seclabel,i_version,data=ordered)
>
> Lovely... Smells like something making dracut unhappy. Does it say
> anything interesting when booting with rdinitdebug in command line?
> What happens with rdshell in there?
>
> I don't have easily accessible Fedora boxen at the moment and it does
> look like something fishy going on while initramfs stuff runs...
No difference with rdinitdebug, rdshell, but the message
"ata5: SATA link down (SStatus 0 SControl 300)" prior to "Freeing unused
kernel memory... ", "Failed to execute /init" and the trace back,
probably helps. Don't know how I missed that.
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-29 21:38 ` Mimi Zohar
@ 2012-06-29 23:56 ` Mimi Zohar
2012-06-30 5:02 ` Al Viro
0 siblings, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-06-29 23:56 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Fri, 2012-06-29 at 17:38 -0400, Mimi Zohar wrote:
> On Fri, 2012-06-29 at 18:41 +0100, Al Viro wrote:
> > On Fri, Jun 29, 2012 at 09:02:05AM -0400, Mimi Zohar wrote:
> >
> > > In addition to failing with my tailored .config (eg. IMA, TPM builtin),
> > > it fails with config-3.4.2-1.fc16.x86_64, with new default options,
> > > running on a Lenovo W520.
> > >
> > > /dev/sda7 on / type ext4 (rw,relatime,seclabel,i_version,data=ordered)
> >
> > Lovely... Smells like something making dracut unhappy. Does it say
> > anything interesting when booting with rdinitdebug in command line?
> > What happens with rdshell in there?
> >
> > I don't have easily accessible Fedora boxen at the moment and it does
> > look like something fishy going on while initramfs stuff runs...
>
> No difference with rdinitdebug, rdshell, but the message
> "ata5: SATA link down (SStatus 0 SControl 300)" prior to "Freeing unused
> kernel memory... ", "Failed to execute /init" and the trace back,
> probably helps. Don't know how I missed that.
Looking at /var/log/messages, seems like the ata4 and ata5 "SATA link
down (SStatus 0 SControl 300)" messages are normal.
ata5: SATA link down (SStatus 0 SControl 300)
Freeing unused kernel memory: 1016k freed
Write protecting the kernel read-only data: 12288k
Freeing unused kernel memory: 1964k freed
Freeing unused kernel memory: 1468k freed
Failed to execute /init
Kernel panic - not syncing. No init Found. Try passing init= option ...
Pid: 1, comm: swapper/0 not tainted 3.5.0-rc1+
Call Trace:
panic
init_post
kernel_init
?do_early_param
kernel_thread_helper
start_kernel
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-29 23:56 ` Mimi Zohar
@ 2012-06-30 5:02 ` Al Viro
2012-07-01 19:50 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-06-30 5:02 UTC (permalink / raw)
To: Mimi Zohar
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Fri, Jun 29, 2012 at 07:56:37PM -0400, Mimi Zohar wrote:
> Looking at /var/log/messages, seems like the ata4 and ata5 "SATA link
> down (SStatus 0 SControl 300)" messages are normal.
>
> ata5: SATA link down (SStatus 0 SControl 300)
> Freeing unused kernel memory: 1016k freed
> Write protecting the kernel read-only data: 12288k
> Freeing unused kernel memory: 1964k freed
> Freeing unused kernel memory: 1468k freed
> Failed to execute /init
> Kernel panic - not syncing. No init Found. Try passing init= option ...
> Pid: 1, comm: swapper/0 not tainted 3.5.0-rc1+
> Call Trace:
> panic
> init_post
> kernel_init
> ?do_early_param
> kernel_thread_helper
> start_kernel
Just to make sure - you are not getting IMA violations among all that? AFAICS,
the damn thing should behave no worse in that respect than your own patch
a while ago, and you haven't mentioned them in this thread, but...
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-06-30 5:02 ` Al Viro
@ 2012-07-01 19:50 ` Mimi Zohar
2012-07-01 20:57 ` Al Viro
0 siblings, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-07-01 19:50 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Sat, 2012-06-30 at 06:02 +0100, Al Viro wrote:
> On Fri, Jun 29, 2012 at 07:56:37PM -0400, Mimi Zohar wrote:
> > Looking at /var/log/messages, seems like the ata4 and ata5 "SATA link
> > down (SStatus 0 SControl 300)" messages are normal.
> >
> > ata5: SATA link down (SStatus 0 SControl 300)
> > Freeing unused kernel memory: 1016k freed
> > Write protecting the kernel read-only data: 12288k
> > Freeing unused kernel memory: 1964k freed
> > Freeing unused kernel memory: 1468k freed
> > Failed to execute /init
> > Kernel panic - not syncing. No init Found. Try passing init= option ...
> > Pid: 1, comm: swapper/0 not tainted 3.5.0-rc1+
> > Call Trace:
> > panic
> > init_post
> > kernel_init
> > ?do_early_param
> > kernel_thread_helper
> > start_kernel
>
> Just to make sure - you are not getting IMA violations among all that?
I'm not running with the IMA-appraisal patches, nor does the
Fedora .config enable IMA. So I'm not getting violations.
> AFAICS,
> the damn thing should behave no worse in that respect than your own patch
> a while ago, and you haven't mentioned them in this thread, but...
I haven't mentioned the "ima: defer calling __fput()" patch, since I've
compiled git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git
#untested with a .config based on config-3.4.2-1.fc16.x86_64 and am
having this problem. No need to add more confusion. The "ima: defer
calling __fput()" will be dropped from the patchset, as soon as the
general method works.
I've isolated the problem to the PF_KTHREAD section of fput().
void fput(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
file_sb_list_del(file);
if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) {
unsigned long flags;
spin_lock_irqsave(&delayed_fput_lock, flags);
list_add(&file->f_u.fu_list, &delayed_fput_list);
schedule_work(&delayed_fput_work);
spin_unlock_irqrestore(&delayed_fput_lock, flags);
return;
}
init_task_work(&file->f_u.fu_rcuhead, ____fput);
task_work_add(task, &file->f_u.fu_rcuhead, true);
}
}
Replacing it with a call to __fput(), the system boots.
thanks,
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-01 19:50 ` Mimi Zohar
@ 2012-07-01 20:57 ` Al Viro
2012-07-02 1:46 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-07-01 20:57 UTC (permalink / raw)
To: Mimi Zohar
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Sun, Jul 01, 2012 at 03:50:02PM -0400, Mimi Zohar wrote:
>
> I haven't mentioned the "ima: defer calling __fput()" patch, since I've
> compiled git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git
> #untested with a .config based on config-3.4.2-1.fc16.x86_64 and am
> having this problem. No need to add more confusion. The "ima: defer
> calling __fput()" will be dropped from the patchset, as soon as the
> general method works.
>
> I've isolated the problem to the PF_KTHREAD section of fput().
>
> void fput(struct file *file)
> {
> if (atomic_long_dec_and_test(&file->f_count)) {
> struct task_struct *task = current;
> file_sb_list_del(file);
> if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) {
> unsigned long flags;
> spin_lock_irqsave(&delayed_fput_lock, flags);
> list_add(&file->f_u.fu_list, &delayed_fput_list);
> schedule_work(&delayed_fput_work);
> spin_unlock_irqrestore(&delayed_fput_lock, flags);
> return;
> }
> init_task_work(&file->f_u.fu_rcuhead, ____fput);
> task_work_add(task, &file->f_u.fu_rcuhead, true);
> }
> }
>
> Replacing it with a call to __fput(), the system boots.
"it" being just the part under that if (unlikely(...)))? Very interesting... If so, we
have some kernel thread ending up with delayed __fput() which somehow makes dracut (assuimg
you are using fedora initramfs to go with fedora config) unhappy. With your own patch,
doing async __fput() in a lot of cases when this one doesn't delay past the return to
userland managing to survive the boot... I wonder which files end up triggering that fun
and which kernel thread is responsible... Could you slap a printk() in there, showing
file->f_dentry->d_inode->i_mode (octal) and at least file->f_dentry->d_name.name?
Along with the current->comm[], all under that inner if (). And see which ones end up
going that way by the time execve() of /sbin/init fails.
It would be nice to see which sys_mount() calls are made and which (if any) fail, BTW.
I wonder if it even gets to mounting the right root...
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-01 20:57 ` Al Viro
@ 2012-07-02 1:46 ` Mimi Zohar
2012-07-02 3:43 ` Al Viro
0 siblings, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-07-02 1:46 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Sun, 2012-07-01 at 21:57 +0100, Al Viro wrote:
> On Sun, Jul 01, 2012 at 03:50:02PM -0400, Mimi Zohar wrote:
> > Replacing it with a call to __fput(), the system boots.
>
> "it" being just the part under that if (unlikely(...)))? Very interesting... If so, we
> have some kernel thread ending up with delayed __fput() which somehow makes dracut (assuimg
> you are using fedora initramfs to go with fedora config) unhappy. With your own patch,
> doing async __fput() in a lot of cases when this one doesn't delay past the return to
> userland managing to survive the boot... I wonder which files end up triggering that fun
> and which kernel thread is responsible... Could you slap a printk() in there, showing
> file->f_dentry->d_inode->i_mode (octal) and at least file->f_dentry->d_name.name?
> Along with the current->comm[], all under that inner if (). And see which ones end up
> going that way by the time execve() of /sbin/init fails.
pid=1 uid=0 d_name=init comm=swapper/0 dev="rootfs" mode=100775
pid=1 uid=0 d_name=bash comm=swapper/0 dev="rootfs" mode=100755
> It would be nice to see which sys_mount() calls are made and which (if any) fail, BTW.
> I wonder if it even gets to mounting the right root...
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-02 1:46 ` Mimi Zohar
@ 2012-07-02 3:43 ` Al Viro
2012-07-02 5:11 ` Al Viro
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-07-02 3:43 UTC (permalink / raw)
To: Mimi Zohar
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Sun, Jul 01, 2012 at 09:46:31PM -0400, Mimi Zohar wrote:
> On Sun, 2012-07-01 at 21:57 +0100, Al Viro wrote:
> > On Sun, Jul 01, 2012 at 03:50:02PM -0400, Mimi Zohar wrote:
> > > Replacing it with a call to __fput(), the system boots.
> >
> > "it" being just the part under that if (unlikely(...)))? Very interesting... If so, we
> > have some kernel thread ending up with delayed __fput() which somehow makes dracut (assuimg
> > you are using fedora initramfs to go with fedora config) unhappy. With your own patch,
> > doing async __fput() in a lot of cases when this one doesn't delay past the return to
> > userland managing to survive the boot... I wonder which files end up triggering that fun
> > and which kernel thread is responsible... Could you slap a printk() in there, showing
> > file->f_dentry->d_inode->i_mode (octal) and at least file->f_dentry->d_name.name?
> > Along with the current->comm[], all under that inner if (). And see which ones end up
> > going that way by the time execve() of /sbin/init fails.
>
> pid=1 uid=0 d_name=init comm=swapper/0 dev="rootfs" mode=100775
> pid=1 uid=0 d_name=bash comm=swapper/0 dev="rootfs" mode=100755
OK... Here's what I suspect is going on:
* populating initramfs writes binaries there. We open files (for write) from
the kernel thread (there's nothing other than kernel threads at that point), write to
them, then close(). Final fput() gets delayed.
* Then we proceed to execve(). Which means mapping the binary with MAP_DENYWRITE.
Which fails, since there's a struct file still opened for write on that sucker.
Your patch did not delay those fput() - they were done without ->mmap_sem held. So
it survived. Booting without initramfs always survives; booting with initramfs may
or may not survive, depending on the timings - if that scheduled work manages to
run by the time we do those execve(), we win. Note that async_synchronize_full()
done in init_post() might easily affect that, depending on config.
As a quick test, could you try slapping a delay somewhere around the beginning
of init_post() and see if it rescues the system?
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-02 3:43 ` Al Viro
@ 2012-07-02 5:11 ` Al Viro
2012-07-02 11:49 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-07-02 5:11 UTC (permalink / raw)
To: Mimi Zohar
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Mon, Jul 02, 2012 at 04:43:10AM +0100, Al Viro wrote:
> On Sun, Jul 01, 2012 at 09:46:31PM -0400, Mimi Zohar wrote:
> > On Sun, 2012-07-01 at 21:57 +0100, Al Viro wrote:
> > > On Sun, Jul 01, 2012 at 03:50:02PM -0400, Mimi Zohar wrote:
> > > > Replacing it with a call to __fput(), the system boots.
> > >
> > > "it" being just the part under that if (unlikely(...)))? Very interesting... If so, we
> > > have some kernel thread ending up with delayed __fput() which somehow makes dracut (assuimg
> > > you are using fedora initramfs to go with fedora config) unhappy. With your own patch,
> > > doing async __fput() in a lot of cases when this one doesn't delay past the return to
> > > userland managing to survive the boot... I wonder which files end up triggering that fun
> > > and which kernel thread is responsible... Could you slap a printk() in there, showing
> > > file->f_dentry->d_inode->i_mode (octal) and at least file->f_dentry->d_name.name?
> > > Along with the current->comm[], all under that inner if (). And see which ones end up
> > > going that way by the time execve() of /sbin/init fails.
> >
> > pid=1 uid=0 d_name=init comm=swapper/0 dev="rootfs" mode=100775
> > pid=1 uid=0 d_name=bash comm=swapper/0 dev="rootfs" mode=100755
>
> OK... Here's what I suspect is going on:
> * populating initramfs writes binaries there. We open files (for write) from
> the kernel thread (there's nothing other than kernel threads at that point), write to
> them, then close(). Final fput() gets delayed.
> * Then we proceed to execve(). Which means mapping the binary with MAP_DENYWRITE.
> Which fails, since there's a struct file still opened for write on that sucker.
>
> Your patch did not delay those fput() - they were done without ->mmap_sem held. So
> it survived. Booting without initramfs always survives; booting with initramfs may
> or may not survive, depending on the timings - if that scheduled work manages to
> run by the time we do those execve(), we win. Note that async_synchronize_full()
> done in init_post() might easily affect that, depending on config.
>
> As a quick test, could you try slapping a delay somewhere around the beginning
> of init_post() and see if it rescues the system?
Ho-hum... How about this (modulo missing documentation of the whole sad mess):
diff --git a/fs/file_table.c b/fs/file_table.c
index 470da0b..00fd849 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -284,6 +284,11 @@ static void ____fput(struct callback_head *work)
__fput(container_of(work, struct file, f_u.fu_rcuhead));
}
+void flush_delayed_fput(void)
+{
+ delayed_fput(NULL);
+}
+
static DECLARE_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file)
diff --git a/include/linux/file.h b/include/linux/file.h
index 58bf158..d9a4f5a 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -39,4 +39,6 @@ extern void put_unused_fd(unsigned int fd);
extern void fd_install(unsigned int fd, struct file *file);
+extern void flush_delayed_fput(void);
+
#endif /* __LINUX_FILE_H */
diff --git a/init/main.c b/init/main.c
index b5cc0a7..3f151f6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -68,6 +68,7 @@
#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
+#include <linux/file.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -804,8 +805,8 @@ static noinline int init_post(void)
system_state = SYSTEM_RUNNING;
numa_default_policy();
-
current->signal->flags |= SIGNAL_UNKILLABLE;
+ flush_delayed_fput();
if (ramdisk_execute_command) {
run_init_process(ramdisk_execute_command);
^ permalink raw reply related [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-02 5:11 ` Al Viro
@ 2012-07-02 11:49 ` Mimi Zohar
2012-07-02 12:02 ` Al Viro
0 siblings, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-07-02 11:49 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Mon, 2012-07-02 at 06:11 +0100, Al Viro wrote:
> On Mon, Jul 02, 2012 at 04:43:10AM +0100, Al Viro wrote:
> > On Sun, Jul 01, 2012 at 09:46:31PM -0400, Mimi Zohar wrote:
> > > On Sun, 2012-07-01 at 21:57 +0100, Al Viro wrote:
> > > > On Sun, Jul 01, 2012 at 03:50:02PM -0400, Mimi Zohar wrote:
> > > > > Replacing it with a call to __fput(), the system boots.
> > > >
> > > > "it" being just the part under that if (unlikely(...)))? Very interesting... If so, we
> > > > have some kernel thread ending up with delayed __fput() which somehow makes dracut (assuimg
> > > > you are using fedora initramfs to go with fedora config) unhappy. With your own patch,
> > > > doing async __fput() in a lot of cases when this one doesn't delay past the return to
> > > > userland managing to survive the boot... I wonder which files end up triggering that fun
> > > > and which kernel thread is responsible... Could you slap a printk() in there, showing
> > > > file->f_dentry->d_inode->i_mode (octal) and at least file->f_dentry->d_name.name?
> > > > Along with the current->comm[], all under that inner if (). And see which ones end up
> > > > going that way by the time execve() of /sbin/init fails.
> > >
> > > pid=1 uid=0 d_name=init comm=swapper/0 dev="rootfs" mode=100775
> > > pid=1 uid=0 d_name=bash comm=swapper/0 dev="rootfs" mode=100755
> >
> > OK... Here's what I suspect is going on:
> > * populating initramfs writes binaries there. We open files (for write) from
> > the kernel thread (there's nothing other than kernel threads at that point), write to
> > them, then close(). Final fput() gets delayed.
> > * Then we proceed to execve(). Which means mapping the binary with MAP_DENYWRITE.
> > Which fails, since there's a struct file still opened for write on that sucker.
> >
> > Your patch did not delay those fput() - they were done without ->mmap_sem held. So
> > it survived. Booting without initramfs always survives; booting with initramfs may
> > or may not survive, depending on the timings - if that scheduled work manages to
> > run by the time we do those execve(), we win. Note that async_synchronize_full()
> > done in init_post() might easily affect that, depending on config.
> >
> > As a quick test, could you try slapping a delay somewhere around the beginning
> > of init_post() and see if it rescues the system?
>
> Ho-hum... How about this (modulo missing documentation of the whole sad mess):
Sorry, neither adding the delay or this patch helped.
> diff --git a/fs/file_table.c b/fs/file_table.c
> index 470da0b..00fd849 100644
> --- a/fs/file_table.c
> +++ b/fs/file_table.c
> @@ -284,6 +284,11 @@ static void ____fput(struct callback_head *work)
> __fput(container_of(work, struct file, f_u.fu_rcuhead));
> }
>
> +void flush_delayed_fput(void)
> +{
> + delayed_fput(NULL);
> +}
> +
> static DECLARE_WORK(delayed_fput_work, delayed_fput);
>
> void fput(struct file *file)
> diff --git a/include/linux/file.h b/include/linux/file.h
> index 58bf158..d9a4f5a 100644
> --- a/include/linux/file.h
> +++ b/include/linux/file.h
> @@ -39,4 +39,6 @@ extern void put_unused_fd(unsigned int fd);
>
> extern void fd_install(unsigned int fd, struct file *file);
>
> +extern void flush_delayed_fput(void);
> +
> #endif /* __LINUX_FILE_H */
> diff --git a/init/main.c b/init/main.c
> index b5cc0a7..3f151f6 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -68,6 +68,7 @@
> #include <linux/shmem_fs.h>
> #include <linux/slab.h>
> #include <linux/perf_event.h>
> +#include <linux/file.h>
>
> #include <asm/io.h>
> #include <asm/bugs.h>
> @@ -804,8 +805,8 @@ static noinline int init_post(void)
> system_state = SYSTEM_RUNNING;
> numa_default_policy();
>
> -
> current->signal->flags |= SIGNAL_UNKILLABLE;
> + flush_delayed_fput();
>
> if (ramdisk_execute_command) {
> run_init_process(ramdisk_execute_command);
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-02 11:49 ` Mimi Zohar
@ 2012-07-02 12:02 ` Al Viro
2012-07-02 13:01 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-07-02 12:02 UTC (permalink / raw)
To: Mimi Zohar
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Mon, Jul 02, 2012 at 07:49:50AM -0400, Mimi Zohar wrote:
> > > > pid=1 uid=0 d_name=init comm=swapper/0 dev="rootfs" mode=100775
> > > > pid=1 uid=0 d_name=bash comm=swapper/0 dev="rootfs" mode=100755
> > >
> > > OK... Here's what I suspect is going on:
> > > * populating initramfs writes binaries there. We open files (for write) from
> > > the kernel thread (there's nothing other than kernel threads at that point), write to
> > > them, then close(). Final fput() gets delayed.
> > > * Then we proceed to execve(). Which means mapping the binary with MAP_DENYWRITE.
> > > Which fails, since there's a struct file still opened for write on that sucker.
> > >
> > > Your patch did not delay those fput() - they were done without ->mmap_sem held. So
> > > it survived. Booting without initramfs always survives; booting with initramfs may
> > > or may not survive, depending on the timings - if that scheduled work manages to
> > > run by the time we do those execve(), we win. Note that async_synchronize_full()
> > > done in init_post() might easily affect that, depending on config.
> > >
> > > As a quick test, could you try slapping a delay somewhere around the beginning
> > > of init_post() and see if it rescues the system?
> >
> > Ho-hum... How about this (modulo missing documentation of the whole sad mess):
>
> Sorry, neither adding the delay or this patch helped.
Really odd. Could you print the error returned by kernel_execve() in run_init_process()?
At least that way we'll get some indication of what's going on there. Another thing:
could you slap matching printks into the nested if() in fput() and the loop in
delayed_fput(), just to see if we do get __fput() done on all the right struct file?
Just "fput: %p", file and "delayed_fput: %p", file would probably be enough.
I'm assuming that I hadn't misparsed what you wrote and that __fput() in nested
if() in fput() was enough to get the thing working. Could you confirm that?
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-02 12:02 ` Al Viro
@ 2012-07-02 13:01 ` Mimi Zohar
2012-07-02 13:33 ` Al Viro
0 siblings, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-07-02 13:01 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Mon, 2012-07-02 at 13:02 +0100, Al Viro wrote:
> On Mon, Jul 02, 2012 at 07:49:50AM -0400, Mimi Zohar wrote:
>
> > > > > pid=1 uid=0 d_name=init comm=swapper/0 dev="rootfs" mode=100775
> > > > > pid=1 uid=0 d_name=bash comm=swapper/0 dev="rootfs" mode=100755
> > > >
> > > > OK... Here's what I suspect is going on:
> > > > * populating initramfs writes binaries there. We open files (for write) from
> > > > the kernel thread (there's nothing other than kernel threads at that point), write to
> > > > them, then close(). Final fput() gets delayed.
> > > > * Then we proceed to execve(). Which means mapping the binary with MAP_DENYWRITE.
> > > > Which fails, since there's a struct file still opened for write on that sucker.
> > > >
> > > > Your patch did not delay those fput() - they were done without ->mmap_sem held. So
> > > > it survived. Booting without initramfs always survives; booting with initramfs may
> > > > or may not survive, depending on the timings - if that scheduled work manages to
> > > > run by the time we do those execve(), we win. Note that async_synchronize_full()
> > > > done in init_post() might easily affect that, depending on config.
> > > >
> > > > As a quick test, could you try slapping a delay somewhere around the beginning
> > > > of init_post() and see if it rescues the system?
> > >
> > > Ho-hum... How about this (modulo missing documentation of the whole sad mess):
> >
> > Sorry, neither adding the delay or this patch helped.
>
> Really odd. Could you print the error returned by kernel_execve() in run_init_process()?
> At least that way we'll get some indication of what's going on there. Another thing:
> could you slap matching printks into the nested if() in fput() and the loop in
> delayed_fput(), just to see if we do get __fput() done on all the right struct file?
> Just "fput: %p", file and "delayed_fput: %p", file would probably be enough.
ok, it calls delayed_fput(), but never makes it into the delayed_fput()
while statement. I added an additional printk to make sure
delayed_fput() was being called.
delayed_fput:
fput: xxxx
run_init_process: -26
failed to execute /init
run_init_process: -2
run_init_process: -2
run_init_process: -2
delayed_fput:
fput: xxxx
run_init_process: -26
> I'm assuming that I hadn't misparsed what you wrote and that __fput() in nested
> if() in fput() was enough to get the thing working. Could you confirm that?
Yes, everything is exactly as you described.
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-02 13:01 ` Mimi Zohar
@ 2012-07-02 13:33 ` Al Viro
2012-07-02 14:50 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Al Viro @ 2012-07-02 13:33 UTC (permalink / raw)
To: Mimi Zohar
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Mon, Jul 02, 2012 at 09:01:31AM -0400, Mimi Zohar wrote:
> ok, it calls delayed_fput(), but never makes it into the delayed_fput()
> while statement.
That's because I'm a blind idiot and managed to miss the idiocy in
delayed_fput() - that
list_splice(&head, &delayed_fput_list);
should've been
list_splice_init(&delayed_fput_list, &head);
Even more embarrassingly, I've actually screwed up the build scripts
and hadn't discovered until now that I'd been testing without initramfs;
as soon as I'd found that, the bug had promptly reproduced itself.
Anyway, I've pushed the fix into vfs.git#master. Should propagate
to git.kernel.org in a few (commit 2ae9632 is the branch head)
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH 0/4] Was: deferring __fput()
2012-07-02 13:33 ` Al Viro
@ 2012-07-02 14:50 ` Mimi Zohar
2012-08-21 13:05 ` [PATCH] task_work: add a scheduling point in task_work_run() Eric Dumazet
0 siblings, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-07-02 14:50 UTC (permalink / raw)
To: Al Viro
Cc: Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Mon, 2012-07-02 at 14:33 +0100, Al Viro wrote:
> Anyway, I've pushed the fix into vfs.git#master. Should propagate
> to git.kernel.org in a few (commit 2ae9632 is the branch head)
Thanks! I'm now able to boot with the delayed __fput().
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* [PATCH] task_work: add a scheduling point in task_work_run()
2012-07-02 14:50 ` Mimi Zohar
@ 2012-08-21 13:05 ` Eric Dumazet
2012-08-21 20:37 ` Mimi Zohar
2012-08-22 5:27 ` Michael Wang
0 siblings, 2 replies; 54+ messages in thread
From: Eric Dumazet @ 2012-08-21 13:05 UTC (permalink / raw)
To: Al Viro
Cc: Mimi Zohar, Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
From: Eric Dumazet <edumazet@google.com>
It seems commit 4a9d4b02 (switch fput to task_work_add) reintroduced
the problem addressed in commit 944be0b2 (close_files(): add scheduling
point)
If a server process with a lot of files (say 2 million tcp sockets)
is killed, we can spend a lot of time in task_work_run() and trigger
a soft lockup.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
kernel/task_work.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 91d4e17..d320d44 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -75,6 +75,7 @@ void task_work_run(void)
p = q->next;
q->func(q);
q = p;
+ cond_resched();
}
}
}
^ permalink raw reply related [flat|nested] 54+ messages in thread
* Re: [PATCH] task_work: add a scheduling point in task_work_run()
2012-08-21 13:05 ` [PATCH] task_work: add a scheduling point in task_work_run() Eric Dumazet
@ 2012-08-21 20:37 ` Mimi Zohar
2012-08-21 21:32 ` Eric Dumazet
2012-08-22 5:27 ` Michael Wang
1 sibling, 1 reply; 54+ messages in thread
From: Mimi Zohar @ 2012-08-21 20:37 UTC (permalink / raw)
To: Eric Dumazet
Cc: Al Viro, Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Tue, 2012-08-21 at 15:05 +0200, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> It seems commit 4a9d4b02 (switch fput to task_work_add) reintroduced
> the problem addressed in commit 944be0b2 (close_files(): add scheduling
> point)
>
> If a server process with a lot of files (say 2 million tcp sockets)
> is killed, we can spend a lot of time in task_work_run() and trigger
> a soft lockup.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> kernel/task_work.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/kernel/task_work.c b/kernel/task_work.c
> index 91d4e17..d320d44 100644
> --- a/kernel/task_work.c
> +++ b/kernel/task_work.c
> @@ -75,6 +75,7 @@ void task_work_run(void)
> p = q->next;
> q->func(q);
> q = p;
> + cond_resched();
> }
> }
> }
We're here, because fput() called schedule_work() to delay the last
fput(). The execution needs to take place before the syscall returns to
userspace. Need to read __schedule()... Do you know if cond_resched()
can guarantee that it will be executed before the return to userspace?
thanks,
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH] task_work: add a scheduling point in task_work_run()
2012-08-21 20:37 ` Mimi Zohar
@ 2012-08-21 21:32 ` Eric Dumazet
2012-08-22 3:13 ` Mimi Zohar
0 siblings, 1 reply; 54+ messages in thread
From: Eric Dumazet @ 2012-08-21 21:32 UTC (permalink / raw)
To: Mimi Zohar
Cc: Al Viro, Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Tue, 2012-08-21 at 16:37 -0400, Mimi Zohar wrote:
> We're here, because fput() called schedule_work() to delay the last
> fput(). The execution needs to take place before the syscall returns to
> userspace. Need to read __schedule()... Do you know if cond_resched()
> can guarantee that it will be executed before the return to userspace?
Some clarifications :
- fput() does not call schedule_work() in this case but task_work_add()
- cond_resched() wont return to userspace.
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH] task_work: add a scheduling point in task_work_run()
2012-08-21 21:32 ` Eric Dumazet
@ 2012-08-22 3:13 ` Mimi Zohar
0 siblings, 0 replies; 54+ messages in thread
From: Mimi Zohar @ 2012-08-22 3:13 UTC (permalink / raw)
To: Eric Dumazet
Cc: Al Viro, Oleg Nesterov, Linus Torvalds, . James Morris,
linux-security-module, linux-kernel, David Howells
On Tue, 2012-08-21 at 23:32 +0200, Eric Dumazet wrote:
> On Tue, 2012-08-21 at 16:37 -0400, Mimi Zohar wrote:
>
> > We're here, because fput() called schedule_work() to delay the last
> > fput(). The execution needs to take place before the syscall returns to
> > userspace. Need to read __schedule()... Do you know if cond_resched()
> > can guarantee that it will be executed before the return to userspace?
>
> Some clarifications :
>
> - fput() does not call schedule_work() in this case but task_work_add()
>
> - cond_resched() wont return to userspace.
Thanks for the clarification.
Mimi
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH] task_work: add a scheduling point in task_work_run()
2012-08-21 13:05 ` [PATCH] task_work: add a scheduling point in task_work_run() Eric Dumazet
2012-08-21 20:37 ` Mimi Zohar
@ 2012-08-22 5:27 ` Michael Wang
2012-08-22 5:38 ` Al Viro
1 sibling, 1 reply; 54+ messages in thread
From: Michael Wang @ 2012-08-22 5:27 UTC (permalink / raw)
To: Eric Dumazet
Cc: Al Viro, Mimi Zohar, Oleg Nesterov, Linus Torvalds,
. James Morris, linux-security-module, linux-kernel,
David Howells
Hi, Eric
On 08/21/2012 09:05 PM, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> It seems commit 4a9d4b02 (switch fput to task_work_add) reintroduced
> the problem addressed in commit 944be0b2 (close_files(): add scheduling
> point)
>
> If a server process with a lot of files (say 2 million tcp sockets)
> is killed, we can spend a lot of time in task_work_run() and trigger
> a soft lockup.
The thread will be rescheduled if we support kernel preempt, so this
change may only help the case we haven't enabled CONFIG_PREEMPT, isn't
it? What about using ifndef?
And can we make sure that it is safe to sleep(schedule) at this point?
It may need some totally testing to cover all the situation...
Regards,
Michael Wang
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> kernel/task_work.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/kernel/task_work.c b/kernel/task_work.c
> index 91d4e17..d320d44 100644
> --- a/kernel/task_work.c
> +++ b/kernel/task_work.c
> @@ -75,6 +75,7 @@ void task_work_run(void)
> p = q->next;
> q->func(q);
> q = p;
> + cond_resched();
> }
> }
> }
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
^ permalink raw reply [flat|nested] 54+ messages in thread
* Re: [PATCH] task_work: add a scheduling point in task_work_run()
2012-08-22 5:27 ` Michael Wang
@ 2012-08-22 5:38 ` Al Viro
0 siblings, 0 replies; 54+ messages in thread
From: Al Viro @ 2012-08-22 5:38 UTC (permalink / raw)
To: Michael Wang
Cc: Eric Dumazet, Mimi Zohar, Oleg Nesterov, Linus Torvalds,
. James Morris, linux-security-module, linux-kernel,
David Howells
On Wed, Aug 22, 2012 at 01:27:21PM +0800, Michael Wang wrote:
> And can we make sure that it is safe to sleep(schedule) at this point?
> It may need some totally testing to cover all the situation...
task_work callback can bloody well block, so yes, it is safe. Hell,
we are doing final close from that; that can lead to any amount of
IO, up to and including on-disk file freeing and, in case of vfsmount
kept alive by an opened file after we'd done umount -l, actual final
unmount of a filesystem. That can more than just block, that can block
for a long time if that's a network filesystem...
^ permalink raw reply [flat|nested] 54+ messages in thread