netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
@ 2016-10-02  3:13 Krister Johansen
  2016-10-03  1:18 ` Jamal Hadi Salim
  2016-10-03 18:22 ` Cong Wang
  0 siblings, 2 replies; 14+ messages in thread
From: Krister Johansen @ 2016-10-02  3:13 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: netdev

A tc_action_ops structure is visibile as soon as it is placed in the
act_base list.  When tcf_regsiter_action adds an item to this list and
drops act_mod_lock, registration is not complete until
register_pernet_subsys() finishes.

If two threads attempt to modify a tc action in a way that triggers a
module load, the thread that wins the race ends up defeferencing a NULL
pointer after tcf_action_init_1() invokes a_o->init().  In the
particular case that this submitter encountered, the panic occurred in
tcf_gact_init() when net_generic() returned a NULL tc_action_net
pointer.  The gact_net_id needed to fetch the correct pointer was not
yet set, because the register_pernet_subsys() call was pending in
another thread.

Fixes: ddf97ccdd7cb ("net_sched: add network namespace support for tc actions")
Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
---
 include/net/act_api.h |  1 +
 net/sched/act_api.c   | 64 ++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 82f3c91..9ede72d 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -107,6 +107,7 @@ struct tc_action_ops {
 	struct list_head head;
 	char    kind[IFNAMSIZ];
 	__u32   type; /* TBD to match kind */
+	__u32   pernet_pending;
 	size_t	size;
 	struct module		*owner;
 	int     (*act)(struct sk_buff *, const struct tc_action *,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d09d068..3b51808 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -331,6 +331,7 @@ EXPORT_SYMBOL(tcf_hashinfo_destroy);
 
 static LIST_HEAD(act_base);
 static DEFINE_RWLOCK(act_mod_lock);
+static DECLARE_WAIT_QUEUE_HEAD(act_mod_wait);
 
 int tcf_register_action(struct tc_action_ops *act,
 			struct pernet_operations *ops)
@@ -349,15 +350,18 @@ int tcf_register_action(struct tc_action_ops *act,
 		}
 	}
 	list_add_tail(&act->head, &act_base);
+	act->pernet_pending = 1;
 	write_unlock(&act_mod_lock);
 
 	ret = register_pernet_subsys(ops);
-	if (ret) {
-		tcf_unregister_action(act, ops);
-		return ret;
-	}
+	write_lock(&act_mod_lock);
+	if (ret)
+		list_del(&act->head);
+	act->pernet_pending = 0;
+	write_unlock(&act_mod_lock);
+	wake_up_all(&act_mod_wait);
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(tcf_register_action);
 
@@ -367,8 +371,6 @@ int tcf_unregister_action(struct tc_action_ops *act,
 	struct tc_action_ops *a;
 	int err = -ENOENT;
 
-	unregister_pernet_subsys(ops);
-
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (a == act) {
@@ -378,10 +380,49 @@ int tcf_unregister_action(struct tc_action_ops *act,
 		}
 	}
 	write_unlock(&act_mod_lock);
+
+	if (!err)
+		unregister_pernet_subsys(ops);
+
 	return err;
 }
 EXPORT_SYMBOL(tcf_unregister_action);
 
+static int tcf_module_needed(char *kind)
+{
+	struct tc_action_ops *a;
+	int rc = 0;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	if (!kind)
+		return rc;
+
+	add_wait_queue(&act_mod_wait, &wait);
+	read_lock(&act_mod_lock);
+	for (;;) {
+		list_for_each_entry(a, &act_base, head) {
+			if (strcmp(kind, a->kind) == 0)
+				break;
+		}
+
+		if (!a || strcmp(kind, a->kind) != 0) {
+			rc = 1;
+			break;
+		}
+
+		if  (a->pernet_pending == 0 || signal_pending(current))
+			break;
+
+		read_unlock(&act_mod_lock);
+		wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+		read_lock(&act_mod_lock);
+	}
+	read_unlock(&act_mod_lock);
+	remove_wait_queue(&act_mod_wait, &wait);
+
+	return rc;
+}
+
 /* lookup by name */
 static struct tc_action_ops *tc_lookup_action_n(char *kind)
 {
@@ -391,11 +432,14 @@ static struct tc_action_ops *tc_lookup_action_n(char *kind)
 		read_lock(&act_mod_lock);
 		list_for_each_entry(a, &act_base, head) {
 			if (strcmp(kind, a->kind) == 0) {
+				if (a->pernet_pending != 0)
+					goto out;
 				if (try_module_get(a->owner))
 					res = a;
 				break;
 			}
 		}
+out:
 		read_unlock(&act_mod_lock);
 	}
 	return res;
@@ -410,11 +454,14 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
 		read_lock(&act_mod_lock);
 		list_for_each_entry(a, &act_base, head) {
 			if (nla_strcmp(kind, a->kind) == 0) {
+				if (a->pernet_pending != 0)
+					goto out;
 				if (try_module_get(a->owner))
 					res = a;
 				break;
 			}
 		}
+out:
 		read_unlock(&act_mod_lock);
 	}
 	return res;
@@ -549,7 +596,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 	if (a_o == NULL) {
 #ifdef CONFIG_MODULES
 		rtnl_unlock();
-		request_module("act_%s", act_name);
+		if (tcf_module_needed(act_name))
+			request_module("act_%s", act_name);
 		rtnl_lock();
 
 		a_o = tc_lookup_action_n(act_name);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-02  3:13 [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action Krister Johansen
@ 2016-10-03  1:18 ` Jamal Hadi Salim
  2016-10-04  6:38   ` Krister Johansen
  2016-10-03 18:22 ` Cong Wang
  1 sibling, 1 reply; 14+ messages in thread
From: Jamal Hadi Salim @ 2016-10-03  1:18 UTC (permalink / raw)
  To: Krister Johansen; +Cc: netdev, Cong Wang

On 16-10-01 11:13 PM, Krister Johansen wrote:
> A tc_action_ops structure is visibile as soon as it is placed in the
> act_base list.  When tcf_regsiter_action adds an item to this list and
> drops act_mod_lock, registration is not complete until
> register_pernet_subsys() finishes.
>
> If two threads attempt to modify a tc action in a way that triggers a
> module load, the thread that wins the race ends up defeferencing a NULL
> pointer after tcf_action_init_1() invokes a_o->init().  In the
> particular case that this submitter encountered, the panic occurred in
> tcf_gact_init() when net_generic() returned a NULL tc_action_net
> pointer.  The gact_net_id needed to fetch the correct pointer was not
> yet set, because the register_pernet_subsys() call was pending in
> another thread.
>
> Fixes: ddf97ccdd7cb ("net_sched: add network namespace support for tc actions")
> Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>

Looks reasonable to me but will let Cong a closer look since he added
that code.

cheers,
jamal

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-02  3:13 [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action Krister Johansen
  2016-10-03  1:18 ` Jamal Hadi Salim
@ 2016-10-03 18:22 ` Cong Wang
  2016-10-04  6:39   ` Krister Johansen
  2016-10-05  6:52   ` Krister Johansen
  1 sibling, 2 replies; 14+ messages in thread
From: Cong Wang @ 2016-10-03 18:22 UTC (permalink / raw)
  To: Krister Johansen; +Cc: Jamal Hadi Salim, Linux Kernel Network Developers

[-- Attachment #1: Type: text/plain, Size: 790 bytes --]

On Sat, Oct 1, 2016 at 8:13 PM, Krister Johansen
<kjlx@templeofstupid.com> wrote:
> A tc_action_ops structure is visibile as soon as it is placed in the
> act_base list.  When tcf_regsiter_action adds an item to this list and
> drops act_mod_lock, registration is not complete until
> register_pernet_subsys() finishes.

Hmm, good catch, but does the fix have to be so complicated?

How about moving register_pernet_subsys() under act_mod_lock?
Similar is needed for unregister too of course. This also means
we need to convert act_mod_lock to a mutex which allows blocking.
Fortunately, we don't have to take act_mod_lock in any atomic context.

Please try the attached patch. I also convert the read path to RCU
to avoid a possible deadlock. A quick test shows no lockdep splat.

Thanks!

[-- Attachment #2: act_api.diff --]
[-- Type: text/plain, Size: 3031 bytes --]

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d09d068..4aac846 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -330,7 +330,22 @@ void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
 EXPORT_SYMBOL(tcf_hashinfo_destroy);
 
 static LIST_HEAD(act_base);
-static DEFINE_RWLOCK(act_mod_lock);
+static DEFINE_MUTEX(act_mod_lock);
+
+static int __tcf_unregister_action(struct tc_action_ops *act)
+{
+	struct tc_action_ops *a;
+	int err = -ENOENT;
+
+	list_for_each_entry(a, &act_base, head) {
+		if (a == act) {
+			list_del_rcu(&act->head);
+			err = 0;
+			break;
+		}
+	}
+	return err;
+}
 
 int tcf_register_action(struct tc_action_ops *act,
 			struct pernet_operations *ops)
@@ -341,22 +356,23 @@ int tcf_register_action(struct tc_action_ops *act,
 	if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
 		return -EINVAL;
 
-	write_lock(&act_mod_lock);
+	mutex_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
-			write_unlock(&act_mod_lock);
+			mutex_unlock(&act_mod_lock);
 			return -EEXIST;
 		}
 	}
-	list_add_tail(&act->head, &act_base);
-	write_unlock(&act_mod_lock);
+	list_add_tail_rcu(&act->head, &act_base);
 
 	ret = register_pernet_subsys(ops);
 	if (ret) {
-		tcf_unregister_action(act, ops);
+		__tcf_unregister_action(act);
+		mutex_unlock(&act_mod_lock);
 		return ret;
 	}
 
+	mutex_unlock(&act_mod_lock);
 	return 0;
 }
 EXPORT_SYMBOL(tcf_register_action);
@@ -364,20 +380,13 @@ EXPORT_SYMBOL(tcf_register_action);
 int tcf_unregister_action(struct tc_action_ops *act,
 			  struct pernet_operations *ops)
 {
-	struct tc_action_ops *a;
 	int err = -ENOENT;
 
+	mutex_lock(&act_mod_lock);
 	unregister_pernet_subsys(ops);
+	err = __tcf_unregister_action(act);
+	mutex_unlock(&act_mod_lock);
 
-	write_lock(&act_mod_lock);
-	list_for_each_entry(a, &act_base, head) {
-		if (a == act) {
-			list_del(&act->head);
-			err = 0;
-			break;
-		}
-	}
-	write_unlock(&act_mod_lock);
 	return err;
 }
 EXPORT_SYMBOL(tcf_unregister_action);
@@ -388,15 +397,15 @@ static struct tc_action_ops *tc_lookup_action_n(char *kind)
 	struct tc_action_ops *a, *res = NULL;
 
 	if (kind) {
-		read_lock(&act_mod_lock);
-		list_for_each_entry(a, &act_base, head) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(a, &act_base, head) {
 			if (strcmp(kind, a->kind) == 0) {
 				if (try_module_get(a->owner))
 					res = a;
 				break;
 			}
 		}
-		read_unlock(&act_mod_lock);
+		rcu_read_unlock();
 	}
 	return res;
 }
@@ -407,15 +416,15 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
 	struct tc_action_ops *a, *res = NULL;
 
 	if (kind) {
-		read_lock(&act_mod_lock);
-		list_for_each_entry(a, &act_base, head) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(a, &act_base, head) {
 			if (nla_strcmp(kind, a->kind) == 0) {
 				if (try_module_get(a->owner))
 					res = a;
 				break;
 			}
 		}
-		read_unlock(&act_mod_lock);
+		rcu_read_unlock();
 	}
 	return res;
 }

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-03  1:18 ` Jamal Hadi Salim
@ 2016-10-04  6:38   ` Krister Johansen
  0 siblings, 0 replies; 14+ messages in thread
From: Krister Johansen @ 2016-10-04  6:38 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: Krister Johansen, netdev, Cong Wang

On Sun, Oct 02, 2016 at 09:18:06PM -0400, Jamal Hadi Salim wrote:
> On 16-10-01 11:13 PM, Krister Johansen wrote:
> >A tc_action_ops structure is visibile as soon as it is placed in the
> >act_base list.  When tcf_regsiter_action adds an item to this list and
> >drops act_mod_lock, registration is not complete until
> >register_pernet_subsys() finishes.
> >
> >If two threads attempt to modify a tc action in a way that triggers a
> >module load, the thread that wins the race ends up defeferencing a NULL
> >pointer after tcf_action_init_1() invokes a_o->init().  In the
> >particular case that this submitter encountered, the panic occurred in
> >tcf_gact_init() when net_generic() returned a NULL tc_action_net
> >pointer.  The gact_net_id needed to fetch the correct pointer was not
> >yet set, because the register_pernet_subsys() call was pending in
> >another thread.
> >
> >Fixes: ddf97ccdd7cb ("net_sched: add network namespace support for tc actions")
> >Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
> 
> Looks reasonable to me but will let Cong a closer look since he added
> that code.

Thanks, I appreicate you taking a look.  I'll follow up with Cong.

-K

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-03 18:22 ` Cong Wang
@ 2016-10-04  6:39   ` Krister Johansen
  2016-10-05  6:52   ` Krister Johansen
  1 sibling, 0 replies; 14+ messages in thread
From: Krister Johansen @ 2016-10-04  6:39 UTC (permalink / raw)
  To: Cong Wang
  Cc: Krister Johansen, Jamal Hadi Salim, Linux Kernel Network Developers

Hi Cong,

Thanks for the feedback.

On Mon, Oct 03, 2016 at 11:22:33AM -0700, Cong Wang wrote:
> On Sat, Oct 1, 2016 at 8:13 PM, Krister Johansen
> <kjlx@templeofstupid.com> wrote:
> > A tc_action_ops structure is visibile as soon as it is placed in the
> > act_base list.  When tcf_regsiter_action adds an item to this list and
> > drops act_mod_lock, registration is not complete until
> > register_pernet_subsys() finishes.
> 
> Hmm, good catch, but does the fix have to be so complicated?

There were two reasons that the patch I submitted was more complicated
than your proposal.  The first is simply my own lack of knowledge.  I
didn't see many other net subsystems that held locks across the call to
register_pernet_subsys().  I avoided doing so out of caution / paranoia.

The other reason for blocking if a register_pernet_subsys() was already
pending is the behavior of this code when the lookup fails.  The code in
tcf_action_init_1() calls request_module() when tc_lookup_action_n()
fails.  In the cases that I observed, this could lead to hundreds
modprobe processes running for essentially the same few modules.  Only
one of these calls will succeed.

Since the call to request_module() will sleep until the modprobe process
exits, it didn't seem unreasonable to block other threads in the same
code path.  Instead of blocking on a redundant modprobe call, it blocks
pending the completion of a modprobe that's already in progress.

I admit that the patch I submitted didn't close this window entirely,
but in the tests that I ran I was able to see the number of concurrent
modprobe processes go from dozens down to just a few.

> How about moving register_pernet_subsys() under act_mod_lock?
> Similar is needed for unregister too of course. This also means
> we need to convert act_mod_lock to a mutex which allows blocking.
> Fortunately, we don't have to take act_mod_lock in any atomic context.

If it's permissible to hold act_mod_lock across the call to
register_pernet_subsys, then perhaps this could instead be simplified to
use mutex_lock_interruptible() instead of RCU locking.  The blocking
lock would prevent other operations from triggering a modprobe until
the outstanding load completes.  However, the downside is that any
request_module() would block all other lookups.  My attempt to get
around that problem was to record on the action ops whether a pernet
operation was in progress.

> Please try the attached patch. I also convert the read path to RCU
> to avoid a possible deadlock. A quick test shows no lockdep splat.

I'll give it a try, but it may take me a few days to report back with
results.  In the meantime, let's try to reach consensus on an acceptable
solution.

Thanks again,

-K

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-03 18:22 ` Cong Wang
  2016-10-04  6:39   ` Krister Johansen
@ 2016-10-05  6:52   ` Krister Johansen
  2016-10-05 18:01     ` Cong Wang
  1 sibling, 1 reply; 14+ messages in thread
From: Krister Johansen @ 2016-10-05  6:52 UTC (permalink / raw)
  To: Cong Wang
  Cc: Krister Johansen, Jamal Hadi Salim, Linux Kernel Network Developers

On Mon, Oct 03, 2016 at 11:22:33AM -0700, Cong Wang wrote:
> Please try the attached patch. I also convert the read path to RCU
> to avoid a possible deadlock. A quick test shows no lockdep splat.

I tried this patch, but it doesn't solve the problem.  I got a panic on
my very first try:

  SYSTEM MAP: /boot/System.map-4.7.5-1.el7.x86_64
DEBUG KERNEL: /usr/lib/debug/lib/modules/4.7.5-1.el7.x86_64/vmlinux (4.7.5-1.el7.x86_64)
    DUMPFILE: ./vmcore  [PARTIAL DUMP]
        CPUS: 4
        DATE: Tue Oct  4 22:43:45 2016
      UPTIME: 00:02:32
LOAD AVERAGE: 0.20, 0.14, 0.06
       TASKS: 1637
     RELEASE: 4.7.5-1.el7.x86_64
     VERSION: #1 SMP Tue Oct 4 21:58:08 PDT 2016
     MACHINE: x86_64  (3392 Mhz)
      MEMORY: 4 GB
       PANIC: "BUG: unable to handle kernel NULL pointer dereference at           (null)"
         PID: 20151
     COMMAND: "test"
        TASK: ffff880138f89480  [THREAD_INFO: ffff88009f450000]
         CPU: 2
       STATE: TASK_RUNNING (PANIC)

PID: 20151  TASK: ffff880138f89480  CPU: 2   COMMAND: "test"
 #0 [ffff88009f453368] machine_kexec at ffffffff8105c2db
 #1 [ffff88009f4533c8] __crash_kexec at ffffffff81111ca2
 #2 [ffff88009f453498] crash_kexec at ffffffff81111d9c
 #3 [ffff88009f4534b8] oops_end at ffffffff810309d1
 #4 [ffff88009f4534e0] no_context at ffffffff8106a5de
 #5 [ffff88009f453540] __bad_area_nosemaphore at ffffffff8106a92e
 #6 [ffff88009f453590] bad_area at ffffffff81081b45
 #7 [ffff88009f4535b8] __do_page_fault at ffffffff8106b42d
 #8 [ffff88009f453620] trace_do_page_fault at ffffffff8106b5a3
 #9 [ffff88009f453658] do_async_page_fault at ffffffff81063dda
#10 [ffff88009f453670] async_page_fault at ffffffff81735af8
    [exception RIP: tcf_hash_check+18]
    RIP: ffffffff81649772  RSP: ffff88009f453720  RFLAGS: 00010246
    RAX: 0000000000000000  RBX: 0000000000000001  RCX: 0000000000000001
    RDX: ffff8801297ee440  RSI: 0000000000000000  RDI: 0000000000000000
    RBP: ffff88009f453738   R8: ffffffffa04020f0   R9: ffff88009f453770
    R10: ffffffff8164a6a1  R11: ffff880129621f00  R12: ffff8801297ee440
    R13: ffff8800ba0f9940  R14: ffff880093cd3e6c  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
#11 [ffff88009f453740] tcf_mirred_init at ffffffffa04011cc [act_mirred]
#12 [ffff88009f4537c8] tcf_action_init_1 at ffffffff8164a7c9
#13 [ffff88009f453868] tcf_action_init at ffffffff8164a881
#14 [ffff88009f4539c0] tcf_exts_validate at ffffffff8164868a
#15 [ffff88009f4539e8] u32_set_parms at ffffffffa04153f2 [cls_u32]
#16 [ffff88009f453a60] u32_change at ffffffffa0416bc4 [cls_u32]
#17 [ffff88009f453b50] tc_ctl_tfilter at ffffffff8164900d
#18 [ffff88009f453c48] rtnetlink_rcv_msg at ffffffff8162b8f4
#19 [ffff88009f453cc0] netlink_rcv_skb at ffffffff81650b77
#20 [ffff88009f453ce8] rtnetlink_rcv at ffffffff8162b848
#21 [ffff88009f453d00] netlink_unicast at ffffffff81650531
#22 [ffff88009f453d48] netlink_sendmsg at ffffffff8165091e
#23 [ffff88009f453dc8] sock_sendmsg at ffffffff815fcae8
#24 [ffff88009f453de8] SYSC_sendto at ffffffff815fcfa2
#25 [ffff88009f453f18] sys_sendto at ffffffff815fdb4e
#26 [ffff88009f453f28] do_syscall_64 at ffffffff81003b12
    RIP: 00000000004cbfba  RSP: 000000c8200a4eb8  RFLAGS: 00000212
    RAX: ffffffffffffffda  RBX: 000000c820000180  RCX: 00000000004cbfba
    RDX: 000000000000008c  RSI: 000000c82007a900  RDI: 0000000000000007
    RBP: 0000000000000000   R8: 000000c8200fd844   R9: 000000000000000c
    R10: 0000000000000000  R11: 0000000000000212  R12: 000000c82007a900
    R13: 0000000000000003  R14: 0012492492492492  R15: 0000000000000039
    ORIG_RAX: 000000000000002c  CS: 0033  SS: 002b

The problem here is the same as before: by using RCU the race isn't
fixed because the module is still discoverable from act_base before the
pernet initialization is completed.

You can see from the trap frame that the first two arguments to
tcf_hash_check were 0.  It couldn't look up the correct per-subsystem
pointer because the id hadn't yet been registered.

> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index d09d068..4aac846 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
>  int tcf_register_action(struct tc_action_ops *act,
>  			struct pernet_operations *ops)
> @@ -341,22 +356,23 @@ int tcf_register_action(struct tc_action_ops *act,
>  	if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
>  		return -EINVAL;
>  
> -	write_lock(&act_mod_lock);
> +	mutex_lock(&act_mod_lock);
>  	list_for_each_entry(a, &act_base, head) {
>  		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
> -			write_unlock(&act_mod_lock);
> +			mutex_unlock(&act_mod_lock);
>  			return -EEXIST;
>  		}
>  	}
> -	list_add_tail(&act->head, &act_base);
> -	write_unlock(&act_mod_lock);
> +	list_add_tail_rcu(&act->head, &act_base);
>  
>  	ret = register_pernet_subsys(ops);
>  	if (ret) {
> -		tcf_unregister_action(act, ops);
> +		__tcf_unregister_action(act);
> +		mutex_unlock(&act_mod_lock);
>  		return ret;
>  	}
>  
> +	mutex_unlock(&act_mod_lock);
>  	return 0;
>  }

If lookups use rcu, then the list_add_tail_rcu() has to come after
register_pernet_subsys() returns success.

>  EXPORT_SYMBOL(tcf_register_action);
> @@ -364,20 +380,13 @@ EXPORT_SYMBOL(tcf_register_action);
>  int tcf_unregister_action(struct tc_action_ops *act,
>  			  struct pernet_operations *ops)
>  {
> -	struct tc_action_ops *a;
>  	int err = -ENOENT;
>  
> +	mutex_lock(&act_mod_lock);
>  	unregister_pernet_subsys(ops);
> +	err = __tcf_unregister_action(act);
> +	mutex_unlock(&act_mod_lock);
>  
> -	write_lock(&act_mod_lock);
> -	list_for_each_entry(a, &act_base, head) {
> -		if (a == act) {
> -			list_del(&act->head);
> -			err = 0;
> -			break;
> -		}
> -	}
> -	write_unlock(&act_mod_lock);
>  	return err;
>  }

Similarly, unregistering the pernet_subsystem before ensuring that the
action has been removed from act_base leads to another version of this
race on module unload.  Presumably you need to wait for the rcu grace
period to elapse upon removal before it's actually safe to unregister
the subsystem, since any callers with references to the action will have
the potential for a similar NULL deference if their per-net id suddenly
expires.

>  EXPORT_SYMBOL(tcf_unregister_action);
> @@ -388,15 +397,15 @@ static struct tc_action_ops *tc_lookup_action_n(char *kind)
>  	struct tc_action_ops *a, *res = NULL;
>  
>  	if (kind) {
> -		read_lock(&act_mod_lock);
> -		list_for_each_entry(a, &act_base, head) {
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(a, &act_base, head) {
>  			if (strcmp(kind, a->kind) == 0) {
>  				if (try_module_get(a->owner))
>  					res = a;
>  				break;
>  			}
>  		}
> -		read_unlock(&act_mod_lock);
> +		rcu_read_unlock();
>  	}
>  	return res;
>  }
> @@ -407,15 +416,15 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
>  	struct tc_action_ops *a, *res = NULL;
>  
>  	if (kind) {
> -		read_lock(&act_mod_lock);
> -		list_for_each_entry(a, &act_base, head) {
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(a, &act_base, head) {
>  			if (nla_strcmp(kind, a->kind) == 0) {
>  				if (try_module_get(a->owner))
>  					res = a;
>  				break;
>  			}
>  		}
> -		read_unlock(&act_mod_lock);
> +		rcu_read_unlock();
>  	}
>  	return res;
>  }

Given the fact that RCU tolerates inherent races, but another trip
around the modprobe loop blocks the caller, and forks an unnecessary
process, it's not clear to me that converting this to RCU makes it
simpler, or is the right tool for this particular job.

After testing it, I'm not convinced that this is any less complicated
than the approach that I proposed.

Thanks,

-K

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-05  6:52   ` Krister Johansen
@ 2016-10-05 18:01     ` Cong Wang
  2016-10-05 18:07       ` Cong Wang
                         ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Cong Wang @ 2016-10-05 18:01 UTC (permalink / raw)
  To: Krister Johansen; +Cc: Jamal Hadi Salim, Linux Kernel Network Developers

[-- Attachment #1: Type: text/plain, Size: 1475 bytes --]

On Tue, Oct 4, 2016 at 11:52 PM, Krister Johansen
<kjlx@templeofstupid.com> wrote:
> On Mon, Oct 03, 2016 at 11:22:33AM -0700, Cong Wang wrote:
>> Please try the attached patch. I also convert the read path to RCU
>> to avoid a possible deadlock. A quick test shows no lockdep splat.
>
> I tried this patch, but it doesn't solve the problem.  I got a panic on
> my very first try:

Thanks for testing it.


> The problem here is the same as before: by using RCU the race isn't
> fixed because the module is still discoverable from act_base before the
> pernet initialization is completed.
>
> You can see from the trap frame that the first two arguments to
> tcf_hash_check were 0.  It couldn't look up the correct per-subsystem
> pointer because the id hadn't yet been registered.

I thought the problem is that we don't do pernet ops registration and
action ops registration atomically therefore chose to use mutex+RCU,
but I was wrong, the problem here is just ordering, we need to finish
the pernet initialization before making action ops visible.

If so, why not just reorder them? Does the attached patch make any
sense now? Our pernet init doesn't rely on act_base, so even we have
some race, the worst case is after we initialize the pernet netns for an
action but its ops still not visible, which seems fine (at least no crash).

Or I still miss something here?

(Sorry that I don't have the environment to reproduce your bug)

Thanks for your patience and testing!

[-- Attachment #2: act_api_register.diff --]
[-- Type: text/plain, Size: 1306 bytes --]

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d09d068..6024920 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -341,22 +341,21 @@ int tcf_register_action(struct tc_action_ops *act,
 	if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
 		return -EINVAL;
 
+	ret = register_pernet_subsys(ops);
+	if (ret)
+		return ret;
+
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
 			write_unlock(&act_mod_lock);
+			unregister_pernet_subsys(ops);
 			return -EEXIST;
 		}
 	}
 	list_add_tail(&act->head, &act_base);
 	write_unlock(&act_mod_lock);
 
-	ret = register_pernet_subsys(ops);
-	if (ret) {
-		tcf_unregister_action(act, ops);
-		return ret;
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL(tcf_register_action);
@@ -367,8 +366,6 @@ int tcf_unregister_action(struct tc_action_ops *act,
 	struct tc_action_ops *a;
 	int err = -ENOENT;
 
-	unregister_pernet_subsys(ops);
-
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (a == act) {
@@ -378,6 +375,8 @@ int tcf_unregister_action(struct tc_action_ops *act,
 		}
 	}
 	write_unlock(&act_mod_lock);
+	if (!err)
+		unregister_pernet_subsys(ops);
 	return err;
 }
 EXPORT_SYMBOL(tcf_unregister_action);

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-05 18:01     ` Cong Wang
@ 2016-10-05 18:07       ` Cong Wang
  2016-10-06  6:11       ` Krister Johansen
  2016-10-11  9:28       ` Krister Johansen
  2 siblings, 0 replies; 14+ messages in thread
From: Cong Wang @ 2016-10-05 18:07 UTC (permalink / raw)
  To: Krister Johansen; +Cc: Jamal Hadi Salim, Linux Kernel Network Developers

On Wed, Oct 5, 2016 at 11:01 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Tue, Oct 4, 2016 at 11:52 PM, Krister Johansen
> <kjlx@templeofstupid.com> wrote:
>> On Mon, Oct 03, 2016 at 11:22:33AM -0700, Cong Wang wrote:
>>> Please try the attached patch. I also convert the read path to RCU
>>> to avoid a possible deadlock. A quick test shows no lockdep splat.
>>
>> I tried this patch, but it doesn't solve the problem.  I got a panic on
>> my very first try:
>
> Thanks for testing it.
>
>
>> The problem here is the same as before: by using RCU the race isn't
>> fixed because the module is still discoverable from act_base before the
>> pernet initialization is completed.
>>
>> You can see from the trap frame that the first two arguments to
>> tcf_hash_check were 0.  It couldn't look up the correct per-subsystem
>> pointer because the id hadn't yet been registered.
>
> I thought the problem is that we don't do pernet ops registration and
> action ops registration atomically therefore chose to use mutex+RCU,
> but I was wrong, the problem here is just ordering, we need to finish
> the pernet initialization before making action ops visible.
>
> If so, why not just reorder them? Does the attached patch make any
> sense now? Our pernet init doesn't rely on act_base, so even we have
> some race, the worst case is after we initialize the pernet netns for an
> action but its ops still not visible, which seems fine (at least no crash).

BTW, I should remove the 'if' check for unregister_pernet_subsys() in
tcf_unregister_action()... Otherwise the error path doesn't work. ;-)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-05 18:01     ` Cong Wang
  2016-10-05 18:07       ` Cong Wang
@ 2016-10-06  6:11       ` Krister Johansen
  2016-10-06 19:01         ` Cong Wang
  2016-10-11  9:28       ` Krister Johansen
  2 siblings, 1 reply; 14+ messages in thread
From: Krister Johansen @ 2016-10-06  6:11 UTC (permalink / raw)
  To: Cong Wang
  Cc: Krister Johansen, Jamal Hadi Salim, Linux Kernel Network Developers

On Wed, Oct 05, 2016 at 11:01:38AM -0700, Cong Wang wrote:
> On Tue, Oct 4, 2016 at 11:52 PM, Krister Johansen
> <kjlx@templeofstupid.com> wrote:
> > On Mon, Oct 03, 2016 at 11:22:33AM -0700, Cong Wang wrote:
> >> Please try the attached patch. I also convert the read path to RCU
> >> to avoid a possible deadlock. A quick test shows no lockdep splat.
> >
> > I tried this patch, but it doesn't solve the problem.  I got a panic on
> > my very first try:
> 
> Thanks for testing it.

Absolutely; thanks for helping to try to simplify this fix.

> > The problem here is the same as before: by using RCU the race isn't
> > fixed because the module is still discoverable from act_base before the
> > pernet initialization is completed.
> >
> > You can see from the trap frame that the first two arguments to
> > tcf_hash_check were 0.  It couldn't look up the correct per-subsystem
> > pointer because the id hadn't yet been registered.
> 
> I thought the problem is that we don't do pernet ops registration and
> action ops registration atomically therefore chose to use mutex+RCU,
> but I was wrong, the problem here is just ordering, we need to finish
> the pernet initialization before making action ops visible.
> 
> If so, why not just reorder them? Does the attached patch make any
> sense now? Our pernet init doesn't rely on act_base, so even we have
> some race, the worst case is after we initialize the pernet netns for an
> action but its ops still not visible, which seems fine (at least no crash).
> 
> Or I still miss something here?

I'm not sure.  The reason I didn't take this approach from the outset is
that all of TC's callers of tcf_register_action pass a pointer to a
static structure as their *ops argument.  The existence of code that
checks the action for uniqueness suggests that it's possible for
tcf_register_action to get passed two identical tc_action_ops.  If that
happens in the current code base, we'll also get passed a duplicate
pernet_operations pointer.  The code in register_pernet_subsys() makes
no attempt to check for duplicates.  If we add a pointer that's already
in the list, and subsequently call unregister, the results seem
undefined.  It looks like we'll remove the pernet_operations for the
existing action, assuming we don't corrupt the list in the process.

Is this actually safe?  If so, what corner case is the act->type /
act->kind protecting us from?

> (Sorry that I don't have the environment to reproduce your bug)

I'm sorry that I didn't do a good job of explaining how we end up in
this situation in the first place.  I can give a few more details,
because it may explain some of my concern about the request_module()
call.

The system that encounters this bug launches a bunch of containers from
systemd on boot.  Each container creates a new user, net, pid, and mount
namespace and begins its setup.  When the networking in all of these
containers, each in a new netns, try to configure TC and no modules are
loaded we end up with this race.

I can also reproduce by unloading the modules, and then launching a
bunch of processes that configure tc in new namespaces.

Part of the desire to inhibit extra modprobe calls is that if hundreds
of these all start at once on boot, it's really unnecessary to have all
of the rest of them wait while lots of extra modprobe calls are forked
by the kernel.

> Thanks for your patience and testing!

Thank you for taking the time to look through the fix and discuss
alternatives.

-K

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-06  6:11       ` Krister Johansen
@ 2016-10-06 19:01         ` Cong Wang
  2016-10-09  6:13           ` Krister Johansen
  0 siblings, 1 reply; 14+ messages in thread
From: Cong Wang @ 2016-10-06 19:01 UTC (permalink / raw)
  To: Krister Johansen; +Cc: Jamal Hadi Salim, Linux Kernel Network Developers

On Wed, Oct 5, 2016 at 11:11 PM, Krister Johansen
<kjlx@templeofstupid.com> wrote:
>
> I'm not sure.  The reason I didn't take this approach from the outset is
> that all of TC's callers of tcf_register_action pass a pointer to a
> static structure as their *ops argument.  The existence of code that
> checks the action for uniqueness suggests that it's possible for
> tcf_register_action to get passed two identical tc_action_ops.  If that
> happens in the current code base, we'll also get passed a duplicate

Each tc action module has its own unique ops, and kernel doesn't allow
one module to register twice (either in parallel or not, see
add_unformed_module()), so we should not have a duplicated case.


> pernet_operations pointer.  The code in register_pernet_subsys() makes
> no attempt to check for duplicates.  If we add a pointer that's already
> in the list, and subsequently call unregister, the results seem
> undefined.  It looks like we'll remove the pernet_operations for the
> existing action, assuming we don't corrupt the list in the process.
>
> Is this actually safe?  If so, what corner case is the act->type /
> act->kind protecting us from?

ops->type and ops->kind should be unique too, user-space already
relies on this (tc action ls action xxx). The code exists probably just
for sanity check.

So please give that patch a try, let's see if we miss any other problem.

>
>> (Sorry that I don't have the environment to reproduce your bug)
>
> I'm sorry that I didn't do a good job of explaining how we end up in
> this situation in the first place.  I can give a few more details,
> because it may explain some of my concern about the request_module()
> call.
>
> The system that encounters this bug launches a bunch of containers from
> systemd on boot.  Each container creates a new user, net, pid, and mount
> namespace and begins its setup.  When the networking in all of these
> containers, each in a new netns, try to configure TC and no modules are
> loaded we end up with this race.
>
> I can also reproduce by unloading the modules, and then launching a
> bunch of processes that configure tc in new namespaces.
>
> Part of the desire to inhibit extra modprobe calls is that if hundreds
> of these all start at once on boot, it's really unnecessary to have all
> of the rest of them wait while lots of extra modprobe calls are forked
> by the kernel.

You can tell systemd to load these modules before starting these
containers to avoid blocking, no?

Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-06 19:01         ` Cong Wang
@ 2016-10-09  6:13           ` Krister Johansen
  2016-10-11 17:36             ` Cong Wang
  0 siblings, 1 reply; 14+ messages in thread
From: Krister Johansen @ 2016-10-09  6:13 UTC (permalink / raw)
  To: Cong Wang
  Cc: Krister Johansen, Jamal Hadi Salim, Linux Kernel Network Developers

Hi Cong,
Thanks for the follow-up.

On Thu, Oct 06, 2016 at 12:01:15PM -0700, Cong Wang wrote:
> On Wed, Oct 5, 2016 at 11:11 PM, Krister Johansen
> > pernet_operations pointer.  The code in register_pernet_subsys() makes
> > no attempt to check for duplicates.  If we add a pointer that's already
> > in the list, and subsequently call unregister, the results seem
> > undefined.  It looks like we'll remove the pernet_operations for the
> > existing action, assuming we don't corrupt the list in the process.
> >
> > Is this actually safe?  If so, what corner case is the act->type /
> > act->kind protecting us from?
> 
> ops->type and ops->kind should be unique too, user-space already
> relies on this (tc action ls action xxx). The code exists probably just
> for sanity check.

With that in mind, would it make sense to change the check to a WARN/BUG
or some kind of assertion?  I mistakenly inferred that it was possible to
legtimately end up in this scenario.

> So please give that patch a try, let's see if we miss any other problem.

Will do.  I have not forgotten.  I hope to have results for you in a few
days.

> > Part of the desire to inhibit extra modprobe calls is that if hundreds
> > of these all start at once on boot, it's really unnecessary to have all
> > of the rest of them wait while lots of extra modprobe calls are forked
> > by the kernel.
> 
> You can tell systemd to load these modules before starting these
> containers to avoid blocking, no?

That was exactly what I did to work around the panic until I was able to
get a patch together.  The preload of the modules is still occurring,
but I was hoping to excise that workaround entirely.

Thanks,

-K

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-05 18:01     ` Cong Wang
  2016-10-05 18:07       ` Cong Wang
  2016-10-06  6:11       ` Krister Johansen
@ 2016-10-11  9:28       ` Krister Johansen
  2016-10-11 17:51         ` Cong Wang
  2 siblings, 1 reply; 14+ messages in thread
From: Krister Johansen @ 2016-10-11  9:28 UTC (permalink / raw)
  To: Cong Wang; +Cc: Jamal Hadi Salim, Linux Kernel Network Developers

On Wed, Oct 05, 2016 at 11:01:38AM -0700, Cong Wang wrote:
> Does the attached patch make any sense now? Our pernet init doesn't
> rely on act_base, so even we have some race, the worst case is after
> we initialize the pernet netns for an action but its ops still not
> visible, which seems fine (at least no crash).

I tried to reproduce the panic with this latest patch, but I am unable
to do so.  The one difference I notice between this patch, and the one I
sent to the list, is that with yours it takes much longer before we get
any output from the simultaneous launch of these containers.  Presumably
that's the extra latency added by allowing many extra modprobe calls to
get spawned by request_module().

-K

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-09  6:13           ` Krister Johansen
@ 2016-10-11 17:36             ` Cong Wang
  0 siblings, 0 replies; 14+ messages in thread
From: Cong Wang @ 2016-10-11 17:36 UTC (permalink / raw)
  To: Krister Johansen; +Cc: Jamal Hadi Salim, Linux Kernel Network Developers

On Sat, Oct 8, 2016 at 11:13 PM, Krister Johansen
<kjlx@templeofstupid.com> wrote:
> Hi Cong,
> Thanks for the follow-up.
>
> On Thu, Oct 06, 2016 at 12:01:15PM -0700, Cong Wang wrote:
>> On Wed, Oct 5, 2016 at 11:11 PM, Krister Johansen
>> > pernet_operations pointer.  The code in register_pernet_subsys() makes
>> > no attempt to check for duplicates.  If we add a pointer that's already
>> > in the list, and subsequently call unregister, the results seem
>> > undefined.  It looks like we'll remove the pernet_operations for the
>> > existing action, assuming we don't corrupt the list in the process.
>> >
>> > Is this actually safe?  If so, what corner case is the act->type /
>> > act->kind protecting us from?
>>
>> ops->type and ops->kind should be unique too, user-space already
>> relies on this (tc action ls action xxx). The code exists probably just
>> for sanity check.
>
> With that in mind, would it make sense to change the check to a WARN/BUG
> or some kind of assertion?  I mistakenly inferred that it was possible to
> legtimately end up in this scenario.


Yes, it makes sense to me.


>> > Part of the desire to inhibit extra modprobe calls is that if hundreds
>> > of these all start at once on boot, it's really unnecessary to have all
>> > of the rest of them wait while lots of extra modprobe calls are forked
>> > by the kernel.
>>
>> You can tell systemd to load these modules before starting these
>> containers to avoid blocking, no?
>
> That was exactly what I did to work around the panic until I was able to
> get a patch together.  The preload of the modules is still occurring,
> but I was hoping to excise that workaround entirely.

Or you can compile these modules into kernel, but I am not sure about
the dependencies. :-D

Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action.
  2016-10-11  9:28       ` Krister Johansen
@ 2016-10-11 17:51         ` Cong Wang
  0 siblings, 0 replies; 14+ messages in thread
From: Cong Wang @ 2016-10-11 17:51 UTC (permalink / raw)
  To: Krister Johansen; +Cc: Jamal Hadi Salim, Linux Kernel Network Developers

On Tue, Oct 11, 2016 at 2:28 AM, Krister Johansen
<kjlx@templeofstupid.com> wrote:
> On Wed, Oct 05, 2016 at 11:01:38AM -0700, Cong Wang wrote:
>> Does the attached patch make any sense now? Our pernet init doesn't
>> rely on act_base, so even we have some race, the worst case is after
>> we initialize the pernet netns for an action but its ops still not
>> visible, which seems fine (at least no crash).
>
> I tried to reproduce the panic with this latest patch, but I am unable
> to do so.  The one difference I notice between this patch, and the one I

Nice, so the crash is fixed. I will send out my patch formally.

> sent to the list, is that with yours it takes much longer before we get
> any output from the simultaneous launch of these containers.  Presumably
> that's the extra latency added by allowing many extra modprobe calls to
> get spawned by request_module().

This is a different problem. When we register a pernet ops, the ops->init()
will be called on each container to initialize its pernet data structures,
this is why request_module() blocks on waiting for register_pernet_subsys()
to finish. As we discussed earlier, this could be solved or workaround by
loading modules prior to creating containers. Or if this really needs to be
fixed, it should be in register_pernet_subsys(), since it is not specific to
tc actions, we have so many places loading modules at run-time in
networking subsystem.

Thanks for testing!

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-10-11 18:03 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-02  3:13 [PATCH net] Panic when tc_lookup_action_n finds a partially initialized action Krister Johansen
2016-10-03  1:18 ` Jamal Hadi Salim
2016-10-04  6:38   ` Krister Johansen
2016-10-03 18:22 ` Cong Wang
2016-10-04  6:39   ` Krister Johansen
2016-10-05  6:52   ` Krister Johansen
2016-10-05 18:01     ` Cong Wang
2016-10-05 18:07       ` Cong Wang
2016-10-06  6:11       ` Krister Johansen
2016-10-06 19:01         ` Cong Wang
2016-10-09  6:13           ` Krister Johansen
2016-10-11 17:36             ` Cong Wang
2016-10-11  9:28       ` Krister Johansen
2016-10-11 17:51         ` Cong Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).