linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch] fix netconsole hang with alt-sysrq-t
@ 2004-08-06 19:29 Jeff Moyer
  2004-08-06 19:52 ` Matt Mackall
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Moyer @ 2004-08-06 19:29 UTC (permalink / raw)
  To: mpm; +Cc: linux-kernel

Hi, Matt,

Here's the patch.  Sorry it took me so long, been busy with other work.
Two things which need perhaps more thinking, can netpoll_poll be called
recursively (it didn't look like it to me), and do we care about the racy
nature of the netpoll_set_trap interface?

You'll notice that I reverted part of an earlier changeset which caused us
to call the hard_start_xmit function even if netif_queue_stopped returned
true.  This is a bug.  I preserved the second part of that patch, which was
correct.

I've also bumped the budget from 1 to 16.  As I mentioned, this was a
required change for netdump.

This patch was tested on my dual hammer test system.

Comments welcome.

-Jeff

--- linux-2.6.7/include/linux/netpoll.h.orig	2004-08-06 11:14:11.735851056 -0400
+++ linux-2.6.7/include/linux/netpoll.h	2004-08-06 11:14:33.500542320 -0400
@@ -21,6 +21,7 @@
 	u16 local_port, remote_port;
 	unsigned char local_mac[6], remote_mac[6];
 	struct list_head rx_list;
+	spinlock_t poll_lock;
 };
 
 void netpoll_poll(struct netpoll *np);
--- linux-2.6.7/include/linux/netdevice.h.orig	2004-08-06 13:01:39.438651240 -0400
+++ linux-2.6.7/include/linux/netdevice.h	2004-08-06 13:01:41.414350888 -0400
@@ -462,7 +462,7 @@
 						     unsigned char *haddr);
 	int			(*neigh_setup)(struct net_device *dev, struct neigh_parms *);
 	int			(*accept_fastpath)(struct net_device *, struct dst_entry*);
-#ifdef CONFIG_NETPOLL_RX
+#ifdef CONFIG_NETPOLL
 	int			netpoll_rx;
 #endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
--- linux-2.6.7/net/core/netpoll.c.orig	2004-08-06 11:13:45.230880424 -0400
+++ linux-2.6.7/net/core/netpoll.c	2004-08-06 15:15:14.154229272 -0400
@@ -61,7 +61,8 @@
 
 void netpoll_poll(struct netpoll *np)
 {
-	int budget = 1;
+	int budget = 16;
+	unsigned long flags;
 
 	if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
 		return;
@@ -70,9 +71,21 @@
 	np->dev->poll_controller(np->dev);
 
 	/* If scheduling is stopped, tickle NAPI bits */
-	if(trapped && np->dev->poll &&
-	   test_bit(__LINK_STATE_RX_SCHED, &np->dev->state))
-		np->dev->poll(np->dev, &budget);
+	spin_lock_irqsave(&np->poll_lock, flags);
+	if (np->dev->poll &&
+	    test_bit(__LINK_STATE_RX_SCHED, &np->dev->state)) {
+		np->dev->netpoll_rx |= NETPOLL_RX_DROP;
+		if (trapped)
+			np->dev->poll(np->dev, &budget);
+		else {
+			trapped = 1;
+			np->dev->poll(np->dev, &budget);
+			trapped = 0;
+		}
+		np->dev->netpoll_rx &= ~NETPOLL_RX_DROP;
+	}
+	spin_unlock_irqrestore(&np->poll_lock, flags);
+
 	zap_completion_queue();
 }
 
@@ -168,6 +181,14 @@
 	spin_lock(&np->dev->xmit_lock);
 	np->dev->xmit_lock_owner = smp_processor_id();
 
+	if (netif_queue_stopped(np->dev)) {
+		np->dev->xmit_lock_owner = -1;
+		spin_unlock(&np->dev->xmit_lock);
+
+		netpoll_poll(np);
+		goto repeat;
+	}
+
 	status = np->dev->hard_start_xmit(skb, np->dev);
 	np->dev->xmit_lock_owner = -1;
 	spin_unlock(&np->dev->xmit_lock);
@@ -587,13 +608,12 @@
 	}
 
 	np->dev = ndev;
+	spin_lock_init(&np->poll_lock);
 
 	if(np->rx_hook) {
 		unsigned long flags;
 
-#ifdef CONFIG_NETPOLL_RX
-		np->dev->netpoll_rx = 1;
-#endif
+		np->dev->netpoll_rx = NETPOLL_RX_ENABLED;
 
 		spin_lock_irqsave(&rx_list_lock, flags);
 		list_add(&np->rx_list, &rx_list);
@@ -613,12 +633,10 @@
 
 		spin_lock_irqsave(&rx_list_lock, flags);
 		list_del(&np->rx_list);
-#ifdef CONFIG_NETPOLL_RX
-		np->dev->netpoll_rx = 0;
-#endif
 		spin_unlock_irqrestore(&rx_list_lock, flags);
 	}
 
+	np->dev->netpoll_rx = 0;
 	dev_put(np->dev);
 	np->dev = NULL;
 }
@@ -628,6 +646,7 @@
 	return trapped;
 }
 
+/* this interface is inherently racy.  do we care?  -phro */
 void netpoll_set_trap(int trap)
 {
 	trapped = trap;
--- linux-2.6.7/net/core/dev.c.orig	2004-08-06 11:13:51.237967208 -0400
+++ linux-2.6.7/net/core/dev.c	2004-08-06 13:26:28.246318072 -0400
@@ -1601,7 +1601,7 @@
 	struct softnet_data *queue;
 	unsigned long flags;
 
-#ifdef CONFIG_NETPOLL_RX
+#ifdef CONFIG_NETPOLL
 	if (skb->dev->netpoll_rx && netpoll_rx(skb)) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
@@ -1805,7 +1805,7 @@
 	int ret = NET_RX_DROP;
 	unsigned short type;
 
-#ifdef CONFIG_NETPOLL_RX
+#ifdef CONFIG_NETPOLL
 	if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
--- linux-2.6.7/net/Kconfig.orig	2004-08-06 13:09:21.543400640 -0400
+++ linux-2.6.7/net/Kconfig	2004-08-06 13:09:24.042020792 -0400
@@ -656,9 +656,6 @@
 config NETPOLL
 	def_bool NETCONSOLE || KGDBOE
 
-config NETPOLL_RX
-	def_bool KGDBOE
-
 config NETPOLL_TRAP
 	def_bool KGDBOE
 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-06 19:29 [patch] fix netconsole hang with alt-sysrq-t Jeff Moyer
@ 2004-08-06 19:52 ` Matt Mackall
  2004-08-06 20:01   ` Jeff Moyer
  0 siblings, 1 reply; 11+ messages in thread
From: Matt Mackall @ 2004-08-06 19:52 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, Stelian Pop

On Fri, Aug 06, 2004 at 03:29:27PM -0400, Jeff Moyer wrote:
> Hi, Matt,
> 
> Here's the patch.  Sorry it took me so long, been busy with other work.
> Two things which need perhaps more thinking, can netpoll_poll be called
> recursively (it didn't look like it to me)

It can if the poll function does a printk or the like and wants to
recurse via netconsole. We could short-circuit that with an in_netpoll
flag, but let's worry about that separately.

> and do we care about the racy
> nature of the netpoll_set_trap interface?

That should probably become an atomic now.
 
> You'll notice that I reverted part of an earlier changeset which caused us
> to call the hard_start_xmit function even if netif_queue_stopped returned
> true.  This is a bug.  I preserved the second part of that patch, which was
> correct.

Ok, jgarzik pointed that out to me just a bit ago. I'm not sure if
we're dealing with the behavior that was intended to address yet
though. Stelian, can you try giving this a spin?

> I've also bumped the budget from 1 to 16.  As I mentioned, this was a
> required change for netdump.

Should be fine.

> This patch was tested on my dual hammer test system.

I'll have to re-rig my kgdb-over-ethernet test setup to test this, but
it looks good for now.

> -Jeff
> 
> --- linux-2.6.7/include/linux/netpoll.h.orig	2004-08-06 11:14:11.735851056 -0400
> +++ linux-2.6.7/include/linux/netpoll.h	2004-08-06 11:14:33.500542320 -0400
> @@ -21,6 +21,7 @@
>  	u16 local_port, remote_port;
>  	unsigned char local_mac[6], remote_mac[6];
>  	struct list_head rx_list;
> +	spinlock_t poll_lock;
>  };
>  
>  void netpoll_poll(struct netpoll *np);
> --- linux-2.6.7/include/linux/netdevice.h.orig	2004-08-06 13:01:39.438651240 -0400
> +++ linux-2.6.7/include/linux/netdevice.h	2004-08-06 13:01:41.414350888 -0400
> @@ -462,7 +462,7 @@
>  						     unsigned char *haddr);
>  	int			(*neigh_setup)(struct net_device *dev, struct neigh_parms *);
>  	int			(*accept_fastpath)(struct net_device *, struct dst_entry*);
> -#ifdef CONFIG_NETPOLL_RX
> +#ifdef CONFIG_NETPOLL
>  	int			netpoll_rx;
>  #endif
>  #ifdef CONFIG_NET_POLL_CONTROLLER
> --- linux-2.6.7/net/core/netpoll.c.orig	2004-08-06 11:13:45.230880424 -0400
> +++ linux-2.6.7/net/core/netpoll.c	2004-08-06 15:15:14.154229272 -0400
> @@ -61,7 +61,8 @@
>  
>  void netpoll_poll(struct netpoll *np)
>  {
> -	int budget = 1;
> +	int budget = 16;
> +	unsigned long flags;
>  
>  	if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
>  		return;
> @@ -70,9 +71,21 @@
>  	np->dev->poll_controller(np->dev);
>  
>  	/* If scheduling is stopped, tickle NAPI bits */
> -	if(trapped && np->dev->poll &&
> -	   test_bit(__LINK_STATE_RX_SCHED, &np->dev->state))
> -		np->dev->poll(np->dev, &budget);
> +	spin_lock_irqsave(&np->poll_lock, flags);
> +	if (np->dev->poll &&
> +	    test_bit(__LINK_STATE_RX_SCHED, &np->dev->state)) {
> +		np->dev->netpoll_rx |= NETPOLL_RX_DROP;
> +		if (trapped)
> +			np->dev->poll(np->dev, &budget);
> +		else {
> +			trapped = 1;
> +			np->dev->poll(np->dev, &budget);
> +			trapped = 0;
> +		}
> +		np->dev->netpoll_rx &= ~NETPOLL_RX_DROP;
> +	}
> +	spin_unlock_irqrestore(&np->poll_lock, flags);
> +
>  	zap_completion_queue();
>  }
>  
> @@ -168,6 +181,14 @@
>  	spin_lock(&np->dev->xmit_lock);
>  	np->dev->xmit_lock_owner = smp_processor_id();
>  
> +	if (netif_queue_stopped(np->dev)) {
> +		np->dev->xmit_lock_owner = -1;
> +		spin_unlock(&np->dev->xmit_lock);
> +
> +		netpoll_poll(np);
> +		goto repeat;
> +	}
> +
>  	status = np->dev->hard_start_xmit(skb, np->dev);
>  	np->dev->xmit_lock_owner = -1;
>  	spin_unlock(&np->dev->xmit_lock);
> @@ -587,13 +608,12 @@
>  	}
>  
>  	np->dev = ndev;
> +	spin_lock_init(&np->poll_lock);
>  
>  	if(np->rx_hook) {
>  		unsigned long flags;
>  
> -#ifdef CONFIG_NETPOLL_RX
> -		np->dev->netpoll_rx = 1;
> -#endif
> +		np->dev->netpoll_rx = NETPOLL_RX_ENABLED;
>  
>  		spin_lock_irqsave(&rx_list_lock, flags);
>  		list_add(&np->rx_list, &rx_list);
> @@ -613,12 +633,10 @@
>  
>  		spin_lock_irqsave(&rx_list_lock, flags);
>  		list_del(&np->rx_list);
> -#ifdef CONFIG_NETPOLL_RX
> -		np->dev->netpoll_rx = 0;
> -#endif
>  		spin_unlock_irqrestore(&rx_list_lock, flags);
>  	}
>  
> +	np->dev->netpoll_rx = 0;
>  	dev_put(np->dev);
>  	np->dev = NULL;
>  }
> @@ -628,6 +646,7 @@
>  	return trapped;
>  }
>  
> +/* this interface is inherently racy.  do we care?  -phro */
>  void netpoll_set_trap(int trap)
>  {
>  	trapped = trap;
> --- linux-2.6.7/net/core/dev.c.orig	2004-08-06 11:13:51.237967208 -0400
> +++ linux-2.6.7/net/core/dev.c	2004-08-06 13:26:28.246318072 -0400
> @@ -1601,7 +1601,7 @@
>  	struct softnet_data *queue;
>  	unsigned long flags;
>  
> -#ifdef CONFIG_NETPOLL_RX
> +#ifdef CONFIG_NETPOLL
>  	if (skb->dev->netpoll_rx && netpoll_rx(skb)) {
>  		kfree_skb(skb);
>  		return NET_RX_DROP;
> @@ -1805,7 +1805,7 @@
>  	int ret = NET_RX_DROP;
>  	unsigned short type;
>  
> -#ifdef CONFIG_NETPOLL_RX
> +#ifdef CONFIG_NETPOLL
>  	if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) {
>  		kfree_skb(skb);
>  		return NET_RX_DROP;
> --- linux-2.6.7/net/Kconfig.orig	2004-08-06 13:09:21.543400640 -0400
> +++ linux-2.6.7/net/Kconfig	2004-08-06 13:09:24.042020792 -0400
> @@ -656,9 +656,6 @@
>  config NETPOLL
>  	def_bool NETCONSOLE || KGDBOE
>  
> -config NETPOLL_RX
> -	def_bool KGDBOE
> -
>  config NETPOLL_TRAP
>  	def_bool KGDBOE
>  

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-06 19:52 ` Matt Mackall
@ 2004-08-06 20:01   ` Jeff Moyer
  2004-08-06 20:26     ` Matt Mackall
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Moyer @ 2004-08-06 20:01 UTC (permalink / raw)
  To: Matt Mackall; +Cc: linux-kernel, Stelian Pop

==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt Mackall <mpm@selenic.com> adds:

mpm> On Fri, Aug 06, 2004 at 03:29:27PM -0400, Jeff Moyer wrote:
>> Hi, Matt,
>> 
>> Here's the patch.  Sorry it took me so long, been busy with other work.
>> Two things which need perhaps more thinking, can netpoll_poll be called
>> recursively (it didn't look like it to me)

mpm> It can if the poll function does a printk or the like and wants to
mpm> recurse via netconsole. We could short-circuit that with an in_netpoll
mpm> flag, but let's worry about that separately.

Hmm, ok.

>> and do we care about the racy
>> nature of the netpoll_set_trap interface?

mpm> That should probably become an atomic now.
 
Ouch.  I wanted to avoid that, but if we can't, we can't.  Will
netpoll_set_trap then to an atomic_inc or an atomic_add?  I've only seen it
called with 1 and 0, is that all that was intended?

>> You'll notice that I reverted part of an earlier changeset which caused us
>> to call the hard_start_xmit function even if netif_queue_stopped returned
>> true.  This is a bug.  I preserved the second part of that patch, which was
>> correct.

mpm> Ok, jgarzik pointed that out to me just a bit ago. I'm not sure if
mpm> we're dealing with the behavior that was intended to address yet
mpm> though. Stelian, can you try giving this a spin?

Well, we kept the second part of the patch, which deals with the
hard_start_xmit routine returning 1.  That was a valid bug, I believe.

>> I've also bumped the budget from 1 to 16.  As I mentioned, this was a
>> required change for netdump.

mpm> Should be fine.

>> This patch was tested on my dual hammer test system.

mpm> I'll have to re-rig my kgdb-over-ethernet test setup to test this, but
mpm> it looks good for now.

Yah, and I just noticed we don't want the poll_lock to be per struct
netpoll.  It should be a static lock in the netpoll.c file.  The problem is
that more than one netpoll object can reference the same ethernet device.

Thanks,

Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-06 20:01   ` Jeff Moyer
@ 2004-08-06 20:26     ` Matt Mackall
  2004-08-12 21:01       ` Jeff Moyer
  0 siblings, 1 reply; 11+ messages in thread
From: Matt Mackall @ 2004-08-06 20:26 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, Stelian Pop

On Fri, Aug 06, 2004 at 04:01:35PM -0400, Jeff Moyer wrote:
> ==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt Mackall <mpm@selenic.com> adds:
> 
> mpm> On Fri, Aug 06, 2004 at 03:29:27PM -0400, Jeff Moyer wrote:
> >> Hi, Matt,
> >> 
> >> Here's the patch.  Sorry it took me so long, been busy with other work.
> >> Two things which need perhaps more thinking, can netpoll_poll be called
> >> recursively (it didn't look like it to me)
> 
> mpm> It can if the poll function does a printk or the like and wants to
> mpm> recurse via netconsole. We could short-circuit that with an in_netpoll
> mpm> flag, but let's worry about that separately.
> 
> Hmm, ok.
> 
> >> and do we care about the racy
> >> nature of the netpoll_set_trap interface?
> 
> mpm> That should probably become an atomic now.
>  
> Ouch.  I wanted to avoid that, but if we can't, we can't.  Will
> netpoll_set_trap then to an atomic_inc or an atomic_add?  I've only seen it
> called with 1 and 0, is that all that was intended?

It's a boolean interface. We might switch from set(bool) to
enable()/disable(). More thought required.
 
> >> You'll notice that I reverted part of an earlier changeset which caused us
> >> to call the hard_start_xmit function even if netif_queue_stopped returned
> >> true.  This is a bug.  I preserved the second part of that patch, which was
> >> correct.
> 
> mpm> Ok, jgarzik pointed that out to me just a bit ago. I'm not sure if
> mpm> we're dealing with the behavior that was intended to address yet
> mpm> though. Stelian, can you try giving this a spin?
> 
> Well, we kept the second part of the patch, which deals with the
> hard_start_xmit routine returning 1.  That was a valid bug, I believe.

Probably, but it's hairy enough that I'm not entirely convinced we've
solved the particular problem.

> Yah, and I just noticed we don't want the poll_lock to be per struct
> netpoll.  It should be a static lock in the netpoll.c file.  The problem is
> that more than one netpoll object can reference the same ethernet device.

Good catch. My original design stuck pointers to the netpoll objects
in the net device and then I switched to allowing multiples and didn't
fix that bit.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-06 20:26     ` Matt Mackall
@ 2004-08-12 21:01       ` Jeff Moyer
  2004-08-12 21:18         ` Muli Ben-Yehuda
  2004-08-13  0:29         ` Matt Mackall
  0 siblings, 2 replies; 11+ messages in thread
From: Jeff Moyer @ 2004-08-12 21:01 UTC (permalink / raw)
  To: Matt Mackall; +Cc: linux-kernel, Stelian Pop, jgarzik

==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt Mackall <mpm@selenic.com> adds:

mpm> On Fri, Aug 06, 2004 at 04:01:35PM -0400, Jeff Moyer wrote:
>> ==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt
>> Mackall <mpm@selenic.com> adds:
>> 
mpm> On Fri, Aug 06, 2004 at 03:29:27PM -0400, Jeff Moyer wrote:
>> >> Hi, Matt,
>> >> 
>> >> Here's the patch.  Sorry it took me so long, been busy with other
>> work.  >> Two things which need perhaps more thinking, can netpoll_poll
>> be called >> recursively (it didn't look like it to me)
>> 
mpm> It can if the poll function does a printk or the like and wants to
mpm> recurse via netconsole. We could short-circuit that with an in_netpoll
mpm> flag, but let's worry about that separately.

So how do you want to deal with this case?  We could do something like:

	int cpu = smp_processor_id();

	local_irq_save(flags);
	if (!spin_trylock(&netpoll_poll_lock)) {
		/* allow recursive calls on this cpu */
		if (cpu != poll_owner)
			spin_lock(&netpoll_poll_lock);
	}
	poll_owner = cpu;

	...

	poll_owner = -1;
	spin_unlock(&netpoll_poll_lock);
	local_irq_restore(flags);

>> >> and do we care about the racy >> nature of the netpoll_set_trap
>> interface?
>> 
mpm> That should probably become an atomic now.
>> Ouch.  I wanted to avoid that, but if we can't, we can't.  Will
>> netpoll_set_trap then to an atomic_inc or an atomic_add?  I've only seen
>> it called with 1 and 0, is that all that was intended?

mpm> It's a boolean interface. We might switch from set(bool) to
mpm> enable()/disable(). More thought required.
 
>> >> You'll notice that I reverted part of an earlier changeset which
>> caused us >> to call the hard_start_xmit function even if
>> netif_queue_stopped returned >> true.  This is a bug.  I preserved the
>> second part of that patch, which was >> correct.
>> 
mpm> Ok, jgarzik pointed that out to me just a bit ago. I'm not sure if
mpm> we're dealing with the behavior that was intended to address yet
mpm> though. Stelian, can you try giving this a spin?
>> Well, we kept the second part of the patch, which deals with the
>> hard_start_xmit routine returning 1.  That was a valid bug, I believe.

mpm> Probably, but it's hairy enough that I'm not entirely convinced we've
mpm> solved the particular problem.

>> Yah, and I just noticed we don't want the poll_lock to be per struct
>> netpoll.  It should be a static lock in the netpoll.c file.  The problem
>> is that more than one netpoll object can reference the same ethernet
>> device.

mpm> Good catch. My original design stuck pointers to the netpoll objects
mpm> in the net device and then I switched to allowing multiples and didn't
mpm> fix that bit.

Okay, Matt, here is another take which converts trapped to an atomic, and
moves the poll_lock out of the netdevice structure.  I'm keeping the change
to check for netif_queue_stopped as things are not happy without that
patch.  Jeff, would you mind commenting on why that is correct, please?
It's this hunk here:

@@ -168,6 +180,14 @@
 	spin_lock(&np->dev->xmit_lock);
 	np->dev->xmit_lock_owner = smp_processor_id();
 
+	if (netif_queue_stopped(np->dev)) {
+		np->dev->xmit_lock_owner = -1;
+		spin_unlock(&np->dev->xmit_lock);
+
+		netpoll_poll(np);
+		goto repeat;
+	}
+

Without this test, we would go ahead and call hard_start_xmit even though
the queue was stopped.

Thanks!

Jeff

--- linux-2.6.7/include/linux/netdevice.h.orig	2004-08-06 13:01:39.000000000 -0400
+++ linux-2.6.7/include/linux/netdevice.h	2004-08-06 13:01:41.000000000 -0400
@@ -462,7 +462,7 @@
 						     unsigned char *haddr);
 	int			(*neigh_setup)(struct net_device *dev, struct neigh_parms *);
 	int			(*accept_fastpath)(struct net_device *, struct dst_entry*);
-#ifdef CONFIG_NETPOLL_RX
+#ifdef CONFIG_NETPOLL
 	int			netpoll_rx;
 #endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
--- linux-2.6.7/net/core/netpoll.c.orig	2004-08-06 11:13:45.000000000 -0400
+++ linux-2.6.7/net/core/netpoll.c	2004-08-12 16:32:04.151624208 -0400
@@ -36,7 +36,11 @@
 static spinlock_t rx_list_lock = SPIN_LOCK_UNLOCKED;
 static LIST_HEAD(rx_list);
 
-static int trapped;
+static atomic_t trapped;
+spinlock_t netpoll_poll_lock = SPIN_LOCK_UNLOCKED;
+
+#define NETPOLL_RX_ENABLED  1
+#define NETPOLL_RX_DROP     2
 
 #define MAX_SKB_SIZE \
 		(MAX_UDP_CHUNK + sizeof(struct udphdr) + \
@@ -61,7 +65,8 @@
 
 void netpoll_poll(struct netpoll *np)
 {
-	int budget = 1;
+	int budget = 16;
+	unsigned long flags;
 
 	if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
 		return;
@@ -70,9 +75,19 @@
 	np->dev->poll_controller(np->dev);
 
 	/* If scheduling is stopped, tickle NAPI bits */
-	if(trapped && np->dev->poll &&
-	   test_bit(__LINK_STATE_RX_SCHED, &np->dev->state))
+	spin_lock_irqsave(&netpoll_poll_lock, flags);
+	if (np->dev->poll &&
+	    test_bit(__LINK_STATE_RX_SCHED, &np->dev->state)) {
+		np->dev->netpoll_rx |= NETPOLL_RX_DROP;
+		atomic_inc(&trapped);
+
 		np->dev->poll(np->dev, &budget);
+
+		atomic_dec(&trapped);
+		np->dev->netpoll_rx &= ~NETPOLL_RX_DROP;
+	}
+	spin_unlock_irqrestore(&netpoll_poll_lock, flags);
+
 	zap_completion_queue();
 }
 
@@ -168,6 +183,14 @@
 	spin_lock(&np->dev->xmit_lock);
 	np->dev->xmit_lock_owner = smp_processor_id();
 
+	if (netif_queue_stopped(np->dev)) {
+		np->dev->xmit_lock_owner = -1;
+		spin_unlock(&np->dev->xmit_lock);
+
+		netpoll_poll(np);
+		goto repeat;
+	}
+
 	status = np->dev->hard_start_xmit(skb, np->dev);
 	np->dev->xmit_lock_owner = -1;
 	spin_unlock(&np->dev->xmit_lock);
@@ -337,7 +360,8 @@
 		goto out;
 
 	/* check if netpoll clients need ARP */
-	if (skb->protocol == __constant_htons(ETH_P_ARP) && trapped) {
+	if (skb->protocol == __constant_htons(ETH_P_ARP) && 
+	    atomic_read(&trapped)) {
 		arp_reply(skb);
 		return 1;
 	}
@@ -400,7 +424,7 @@
 	spin_unlock_irqrestore(&rx_list_lock, flags);
 
 out:
-	return trapped;
+	return atomic_read(&trapped);
 }
 
 int netpoll_parse_options(struct netpoll *np, char *opt)
@@ -591,9 +615,7 @@
 	if(np->rx_hook) {
 		unsigned long flags;
 
-#ifdef CONFIG_NETPOLL_RX
-		np->dev->netpoll_rx = 1;
-#endif
+		np->dev->netpoll_rx = NETPOLL_RX_ENABLED;
 
 		spin_lock_irqsave(&rx_list_lock, flags);
 		list_add(&np->rx_list, &rx_list);
@@ -613,24 +635,25 @@
 
 		spin_lock_irqsave(&rx_list_lock, flags);
 		list_del(&np->rx_list);
-#ifdef CONFIG_NETPOLL_RX
-		np->dev->netpoll_rx = 0;
-#endif
 		spin_unlock_irqrestore(&rx_list_lock, flags);
 	}
 
+	np->dev->netpoll_rx = 0;
 	dev_put(np->dev);
 	np->dev = NULL;
 }
 
 int netpoll_trap(void)
 {
-	return trapped;
+	return atomic_read(&trapped);
 }
 
 void netpoll_set_trap(int trap)
 {
-	trapped = trap;
+	if (trap)
+		atomic_inc(&trapped);
+	else
+		atomic_dec(&trapped);
 }
 
 EXPORT_SYMBOL(netpoll_set_trap);
--- linux-2.6.7/net/core/dev.c.orig	2004-08-06 11:13:51.000000000 -0400
+++ linux-2.6.7/net/core/dev.c	2004-08-06 13:26:28.000000000 -0400
@@ -1601,7 +1601,7 @@
 	struct softnet_data *queue;
 	unsigned long flags;
 
-#ifdef CONFIG_NETPOLL_RX
+#ifdef CONFIG_NETPOLL
 	if (skb->dev->netpoll_rx && netpoll_rx(skb)) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
@@ -1805,7 +1805,7 @@
 	int ret = NET_RX_DROP;
 	unsigned short type;
 
-#ifdef CONFIG_NETPOLL_RX
+#ifdef CONFIG_NETPOLL
 	if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) {
 		kfree_skb(skb);
 		return NET_RX_DROP;
--- linux-2.6.7/net/Kconfig.orig	2004-08-06 13:09:21.000000000 -0400
+++ linux-2.6.7/net/Kconfig	2004-08-06 13:09:24.000000000 -0400
@@ -656,9 +656,6 @@
 config NETPOLL
 	def_bool NETCONSOLE || KGDBOE
 
-config NETPOLL_RX
-	def_bool KGDBOE
-
 config NETPOLL_TRAP
 	def_bool KGDBOE
 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-12 21:01       ` Jeff Moyer
@ 2004-08-12 21:18         ` Muli Ben-Yehuda
  2004-08-12 21:32           ` Jeff Moyer
  2004-08-13  0:29         ` Matt Mackall
  1 sibling, 1 reply; 11+ messages in thread
From: Muli Ben-Yehuda @ 2004-08-12 21:18 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Matt Mackall, linux-kernel, Stelian Pop, jgarzik

[-- Attachment #1: Type: text/plain, Size: 679 bytes --]

On Thu, Aug 12, 2004 at 05:01:18PM -0400, Jeff Moyer wrote:

> So how do you want to deal with this case?  We could do something like:
> 
> 	int cpu = smp_processor_id();

That doesn't look right, unless I'm missing something, you could get
preempted here (between the smp_processor_id() and the
local_irq_save() and end up with 'cpu' pointing to the wrong CPU.

> 	local_irq_save(flags);
> 	if (!spin_trylock(&netpoll_poll_lock)) {
> 		/* allow recursive calls on this cpu */
> 		if (cpu != poll_owner)
> 			spin_lock(&netpoll_poll_lock);
> 	}
> 	poll_owner = cpu;

Cheers, 
Muli
-- 
Muli Ben-Yehuda
http://www.mulix.org | http://mulix.livejournal.com/


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-12 21:18         ` Muli Ben-Yehuda
@ 2004-08-12 21:32           ` Jeff Moyer
  2004-08-12 21:39             ` Muli Ben-Yehuda
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Moyer @ 2004-08-12 21:32 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: Matt Mackall, linux-kernel, Stelian Pop, jgarzik

==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Muli Ben-Yehuda <mulix@mulix.org> adds:

mulix> On Thu, Aug 12, 2004 at 05:01:18PM -0400, Jeff Moyer wrote:
>> So how do you want to deal with this case?  We could do something like:
>> 
>> int cpu = smp_processor_id();

mulix> That doesn't look right, unless I'm missing something, you could get
mulix> preempted here (between the smp_processor_id() and the
mulix> local_irq_save() and end up with 'cpu' pointing to the wrong CPU.

Would a preempt_disable() be too hideous?  Other suggestions?

-Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-12 21:32           ` Jeff Moyer
@ 2004-08-12 21:39             ` Muli Ben-Yehuda
  2004-08-13  0:21               ` Matt Mackall
  0 siblings, 1 reply; 11+ messages in thread
From: Muli Ben-Yehuda @ 2004-08-12 21:39 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Matt Mackall, linux-kernel, Stelian Pop, jgarzik

[-- Attachment #1: Type: text/plain, Size: 812 bytes --]

On Thu, Aug 12, 2004 at 05:32:21PM -0400, Jeff Moyer wrote:
> ==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Muli Ben-Yehuda <mulix@mulix.org> adds:
> 
> mulix> On Thu, Aug 12, 2004 at 05:01:18PM -0400, Jeff Moyer wrote:
> >> So how do you want to deal with this case?  We could do something like:
> >> 
> >> int cpu = smp_processor_id();
> 
> mulix> That doesn't look right, unless I'm missing something, you could get
> mulix> preempted here (between the smp_processor_id() and the
> mulix> local_irq_save() and end up with 'cpu' pointing to the wrong CPU.
> 
> Would a preempt_disable() be too hideous?  Other suggestions?

Maybe, but we could hide it in get_cpu() / put_cpu() ;-)

Cheers, 
Muli
-- 
Muli Ben-Yehuda
http://www.mulix.org | http://mulix.livejournal.com/


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-12 21:39             ` Muli Ben-Yehuda
@ 2004-08-13  0:21               ` Matt Mackall
  0 siblings, 0 replies; 11+ messages in thread
From: Matt Mackall @ 2004-08-13  0:21 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: Jeff Moyer, linux-kernel, Stelian Pop, jgarzik

On Fri, Aug 13, 2004 at 12:39:36AM +0300, Muli Ben-Yehuda wrote:
> On Thu, Aug 12, 2004 at 05:32:21PM -0400, Jeff Moyer wrote:
> > ==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Muli Ben-Yehuda <mulix@mulix.org> adds:
> > 
> > mulix> On Thu, Aug 12, 2004 at 05:01:18PM -0400, Jeff Moyer wrote:
> > >> So how do you want to deal with this case?  We could do something like:
> > >> 
> > >> int cpu = smp_processor_id();
> > 
> > mulix> That doesn't look right, unless I'm missing something, you could get
> > mulix> preempted here (between the smp_processor_id() and the
> > mulix> local_irq_save() and end up with 'cpu' pointing to the wrong CPU.
> > 
> > Would a preempt_disable() be too hideous?  Other suggestions?
> 
> Maybe, but we could hide it in get_cpu() / put_cpu() ;-)

Yes, let's. I'll have to think about this general approach a bit more though.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-12 21:01       ` Jeff Moyer
  2004-08-12 21:18         ` Muli Ben-Yehuda
@ 2004-08-13  0:29         ` Matt Mackall
  2004-08-16 18:41           ` Jeff Moyer
  1 sibling, 1 reply; 11+ messages in thread
From: Matt Mackall @ 2004-08-13  0:29 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-kernel, Stelian Pop, jgarzik

On Thu, Aug 12, 2004 at 05:01:18PM -0400, Jeff Moyer wrote:
> ==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt Mackall <mpm@selenic.com> adds:
> 
> mpm> On Fri, Aug 06, 2004 at 04:01:35PM -0400, Jeff Moyer wrote:
> >> ==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt
> >> Mackall <mpm@selenic.com> adds:
> >> 
> mpm> On Fri, Aug 06, 2004 at 03:29:27PM -0400, Jeff Moyer wrote:
> >> >> Hi, Matt,
> >> >> 
> >> >> Here's the patch.  Sorry it took me so long, been busy with other
> >> work.  >> Two things which need perhaps more thinking, can netpoll_poll
> >> be called >> recursively (it didn't look like it to me)
> >> 
> mpm> It can if the poll function does a printk or the like and wants to
> mpm> recurse via netconsole. We could short-circuit that with an in_netpoll
> mpm> flag, but let's worry about that separately.

We've got about 5 different issues in this thread/patch, and they need to be
broken up. I was going to do this, but I'm moving to another city in 4
days. Jeff, if you'd be so kind (otherwise I'll get to it in about a
week and a half):

>  	spin_lock(&np->dev->xmit_lock);
>  	np->dev->xmit_lock_owner = smp_processor_id();
>  
> +	if (netif_queue_stopped(np->dev)) {
> +		np->dev->xmit_lock_owner = -1;
> +		spin_unlock(&np->dev->xmit_lock);
> +
> +		netpoll_poll(np);
> +		goto repeat;
> +	}
> +

Separate patch to revert this hunk with its own comment, please. This
should go in first.

> Without this test, we would go ahead and call hard_start_xmit even though
> the queue was stopped.
> 
> Thanks!
> 
> Jeff
> 
> --- linux-2.6.7/include/linux/netdevice.h.orig	2004-08-06 13:01:39.000000000 -0400
> +++ linux-2.6.7/include/linux/netdevice.h	2004-08-06 13:01:41.000000000 -0400
> @@ -462,7 +462,7 @@
>  						     unsigned char *haddr);
>  	int			(*neigh_setup)(struct net_device *dev, struct neigh_parms *);
>  	int			(*accept_fastpath)(struct net_device *, struct dst_entry*);
> -#ifdef CONFIG_NETPOLL_RX
> +#ifdef CONFIG_NETPOLL

And a separate bit to kill _RX

>  	int			netpoll_rx;
>  #endif
>  #ifdef CONFIG_NET_POLL_CONTROLLER
> --- linux-2.6.7/net/core/netpoll.c.orig	2004-08-06 11:13:45.000000000 -0400
> +++ linux-2.6.7/net/core/netpoll.c	2004-08-12 16:32:04.151624208 -0400
> @@ -36,7 +36,11 @@
>  static spinlock_t rx_list_lock = SPIN_LOCK_UNLOCKED;
>  static LIST_HEAD(rx_list);
>  
> -static int trapped;
> +static atomic_t trapped;

And one for making trapped atomic.

> +spinlock_t netpoll_poll_lock = SPIN_LOCK_UNLOCKED;

And one for globalizing the lock.

> +
> +#define NETPOLL_RX_ENABLED  1
> +#define NETPOLL_RX_DROP     2
>  
>  #define MAX_SKB_SIZE \
>  		(MAX_UDP_CHUNK + sizeof(struct udphdr) + \
> @@ -61,7 +65,8 @@
>  
>  void netpoll_poll(struct netpoll *np)
>  {
> -	int budget = 1;
> +	int budget = 16;

And this one-liner could use a comment and its own patch as well.

> +	unsigned long flags;
>  
>  	if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
>  		return;
> @@ -70,9 +75,19 @@
>  	np->dev->poll_controller(np->dev);
>  
>  	/* If scheduling is stopped, tickle NAPI bits */
> -	if(trapped && np->dev->poll &&
> -	   test_bit(__LINK_STATE_RX_SCHED, &np->dev->state))
> +	spin_lock_irqsave(&netpoll_poll_lock, flags);
> +	if (np->dev->poll &&
> +	    test_bit(__LINK_STATE_RX_SCHED, &np->dev->state)) {
> +		np->dev->netpoll_rx |= NETPOLL_RX_DROP;
> +		atomic_inc(&trapped);
> +
>  		np->dev->poll(np->dev, &budget);
> +
> +		atomic_dec(&trapped);
> +		np->dev->netpoll_rx &= ~NETPOLL_RX_DROP;
> +	}
> +	spin_unlock_irqrestore(&netpoll_poll_lock, flags);
> +
>  	zap_completion_queue();
>  }

And a new patch that brings in the new RX/trapped logic.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] fix netconsole hang with alt-sysrq-t
  2004-08-13  0:29         ` Matt Mackall
@ 2004-08-16 18:41           ` Jeff Moyer
  0 siblings, 0 replies; 11+ messages in thread
From: Jeff Moyer @ 2004-08-16 18:41 UTC (permalink / raw)
  To: Matt Mackall; +Cc: linux-kernel, Stelian Pop, jgarzik

==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt Mackall <mpm@selenic.com> adds:

mpm> On Thu, Aug 12, 2004 at 05:01:18PM -0400, Jeff Moyer wrote:
>> ==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt
>> Mackall <mpm@selenic.com> adds:
>> 
mpm> On Fri, Aug 06, 2004 at 04:01:35PM -0400, Jeff Moyer wrote:
>> >> ==> Regarding Re: [patch] fix netconsole hang with alt-sysrq-t; Matt
>> >> Mackall <mpm@selenic.com> adds:
>> >> 
mpm> On Fri, Aug 06, 2004 at 03:29:27PM -0400, Jeff Moyer wrote:
>> >> >> Hi, Matt,
>> >> >> 
>> >> >> Here's the patch.  Sorry it took me so long, been busy with other
>> >> work.  >> Two things which need perhaps more thinking, can
>> netpoll_poll >> be called >> recursively (it didn't look like it to me)
>> >> 
mpm> It can if the poll function does a printk or the like and wants to
mpm> recurse via netconsole. We could short-circuit that with an in_netpoll
mpm> flag, but let's worry about that separately.

mpm> We've got about 5 different issues in this thread/patch, and they need
mpm> to be broken up. I was going to do this, but I'm moving to another
mpm> city in 4 days. Jeff, if you'd be so kind (otherwise I'll get to it in
mpm> about a week and a half):

I'm all for splitting out patches.  However, I think this is a bit too fine
grained.  I'll separate out what makes sense to me.  If you want it split
out further, let me know and I'll see what I can do.  I'll send out the
patches in a short while.

-Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2004-08-16 18:45 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-08-06 19:29 [patch] fix netconsole hang with alt-sysrq-t Jeff Moyer
2004-08-06 19:52 ` Matt Mackall
2004-08-06 20:01   ` Jeff Moyer
2004-08-06 20:26     ` Matt Mackall
2004-08-12 21:01       ` Jeff Moyer
2004-08-12 21:18         ` Muli Ben-Yehuda
2004-08-12 21:32           ` Jeff Moyer
2004-08-12 21:39             ` Muli Ben-Yehuda
2004-08-13  0:21               ` Matt Mackall
2004-08-13  0:29         ` Matt Mackall
2004-08-16 18:41           ` Jeff Moyer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).