All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net] atlantic: fix deadlock at aq_nic_stop
@ 2022-10-14 10:34 Íñigo Huguet
  2022-10-14 12:13 ` Andrew Lunn
  2022-10-20  7:53 ` [PATCH v2 " Íñigo Huguet
  0 siblings, 2 replies; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-14 10:34 UTC (permalink / raw)
  To: irusskikh, dbogdanov
  Cc: davem, edumazet, kuba, pabeni, netdev, Íñigo Huguet, Li Liang

NIC is stopped with rtnl_lock held, and during the stop it cancels the
'service_task' work and free irqs.

However, if CONFIG_MACSEC is set, rtnl_lock is acquired both from
aq_nic_service_task and aq_linkstate_threaded_isr. Then a deadlock
happens if aq_nic_stop tries to cancel/disable them when they've already
started their execution.

As the deadlock is caused by rtnl_lock, it causes many other processes
to stall, not only atlantic related stuff.

Fix trying to acquire rtnl_lock at the beginning of those functions, and
returning if NIC closing is ongoing. Also do the "linkstate" stuff in a
workqueue instead than in a threaded irq, where sleeping or waiting a
mutex for a long time is discouraged.

The issue appeared repeteadly attaching and deattaching the NIC to a
bond interface. Doing that after this patch I cannot reproduce the bug.

Fixes: 62c1c2e606f6 ("net: atlantic: MACSec offload skeleton")
Reported-by: Li Liang <liali@redhat.com>
Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
---
 .../ethernet/aquantia/atlantic/aq_macsec.c    |  7 +--
 .../net/ethernet/aquantia/atlantic/aq_nic.c   | 59 ++++++++++++++++---
 .../net/ethernet/aquantia/atlantic/aq_nic.h   |  1 +
 3 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
index 3d0e16791e1c..5759eba89db9 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
@@ -1458,7 +1458,7 @@ int aq_macsec_enable(struct aq_nic_s *nic)
 	if (!nic->macsec_cfg)
 		return 0;
 
-	rtnl_lock();
+	ASSERT_RTNL();
 
 	if (nic->aq_fw_ops->send_macsec_req) {
 		struct macsec_cfg_request cfg = { 0 };
@@ -1507,7 +1507,6 @@ int aq_macsec_enable(struct aq_nic_s *nic)
 	ret = aq_apply_macsec_cfg(nic);
 
 unlock:
-	rtnl_unlock();
 	return ret;
 }
 
@@ -1519,9 +1518,9 @@ void aq_macsec_work(struct aq_nic_s *nic)
 	if (!netif_carrier_ok(nic->ndev))
 		return;
 
-	rtnl_lock();
+	ASSERT_RTNL();
+
 	aq_check_txsa_expiration(nic);
-	rtnl_unlock();
 }
 
 int aq_macsec_rx_sa_cnt(struct aq_nic_s *nic)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 06508eebb585..5cb7d165dd21 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -40,6 +40,8 @@ static unsigned int aq_itr_rx;
 module_param_named(aq_itr_rx, aq_itr_rx, uint, 0644);
 MODULE_PARM_DESC(aq_itr_rx, "RX interrupt throttle rate");
 
+#define AQ_TASK_RETRY_MS	50
+
 static void aq_nic_update_ndev_stats(struct aq_nic_s *self);
 
 static void aq_nic_rss_init(struct aq_nic_s *self, unsigned int num_rss_queues)
@@ -210,19 +212,41 @@ static int aq_nic_update_link_status(struct aq_nic_s *self)
 	return 0;
 }
 
-static irqreturn_t aq_linkstate_threaded_isr(int irq, void *private)
+static irqreturn_t aq_linkstate_isr(int irq, void *private)
 {
 	struct aq_nic_s *self = private;
 
 	if (!self)
 		return IRQ_NONE;
 
+	if (!aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING))
+		aq_ndev_schedule_work(&self->linkstate_task);
+
+	return IRQ_HANDLED;
+}
+
+static void aq_nic_linkstate_task(struct work_struct *work)
+{
+	struct aq_nic_s *self = container_of(work, struct aq_nic_s,
+					     linkstate_task);
+
+#if IS_ENABLED(CONFIG_MACSEC)
+	/* avoid deadlock at aq_nic_stop -> cancel_work_sync */
+	while (!rtnl_trylock()) {
+		if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING))
+			return;
+		msleep(AQ_TASK_RETRY_MS);
+	}
+#endif
+
 	aq_nic_update_link_status(self);
 
+#if IS_ENABLED(CONFIG_MACSEC)
+	rtnl_unlock();
+#endif
+
 	self->aq_hw_ops->hw_irq_enable(self->aq_hw,
 				       BIT(self->aq_nic_cfg.link_irq_vec));
-
-	return IRQ_HANDLED;
 }
 
 static void aq_nic_service_task(struct work_struct *work)
@@ -236,12 +260,23 @@ static void aq_nic_service_task(struct work_struct *work)
 	if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAGS_IS_NOT_READY))
 		return;
 
+#if IS_ENABLED(CONFIG_MACSEC)
+	/* avoid deadlock at aq_nic_stop -> cancel_work_sync */
+	while (!rtnl_trylock()) {
+		if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING))
+			return;
+		msleep(AQ_TASK_RETRY_MS);
+	}
+#endif
+
 	err = aq_nic_update_link_status(self);
 	if (err)
 		return;
 
 #if IS_ENABLED(CONFIG_MACSEC)
 	aq_macsec_work(self);
+
+	rtnl_unlock();
 #endif
 
 	mutex_lock(&self->fwreq_mutex);
@@ -505,6 +540,7 @@ int aq_nic_start(struct aq_nic_s *self)
 	if (err)
 		goto err_exit;
 
+	INIT_WORK(&self->linkstate_task, aq_nic_linkstate_task);
 	INIT_WORK(&self->service_task, aq_nic_service_task);
 
 	timer_setup(&self->service_timer, aq_nic_service_timer_cb, 0);
@@ -531,10 +567,9 @@ int aq_nic_start(struct aq_nic_s *self)
 		if (cfg->link_irq_vec) {
 			int irqvec = pci_irq_vector(self->pdev,
 						    cfg->link_irq_vec);
-			err = request_threaded_irq(irqvec, NULL,
-						   aq_linkstate_threaded_isr,
-						   IRQF_SHARED | IRQF_ONESHOT,
-						   self->ndev->name, self);
+			err = request_irq(irqvec, aq_linkstate_isr,
+					  IRQF_SHARED | IRQF_ONESHOT,
+					  self->ndev->name, self);
 			if (err < 0)
 				goto err_exit;
 			self->msix_entry_mask |= (1 << cfg->link_irq_vec);
@@ -1380,11 +1415,15 @@ int aq_nic_set_loopback(struct aq_nic_s *self)
 int aq_nic_stop(struct aq_nic_s *self)
 {
 	unsigned int i = 0U;
+	int ret;
+
+	aq_utils_obj_set(&self->flags, AQ_NIC_FLAG_CLOSING);
 
 	netif_tx_disable(self->ndev);
 	netif_carrier_off(self->ndev);
 
 	del_timer_sync(&self->service_timer);
+	cancel_work_sync(&self->linkstate_task);
 	cancel_work_sync(&self->service_task);
 
 	self->aq_hw_ops->hw_irq_disable(self->aq_hw, AQ_CFG_IRQ_MASK);
@@ -1401,7 +1440,11 @@ int aq_nic_stop(struct aq_nic_s *self)
 
 	aq_ptp_ring_stop(self);
 
-	return self->aq_hw_ops->hw_stop(self->aq_hw);
+	ret = self->aq_hw_ops->hw_stop(self->aq_hw);
+
+	aq_utils_obj_clear(&self->flags, AQ_NIC_FLAG_CLOSING);
+
+	return ret;
 }
 
 void aq_nic_set_power(struct aq_nic_s *self)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index 935ba889bd9a..a114b66990a9 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -140,6 +140,7 @@ struct aq_nic_s {
 	const struct aq_fw_ops *aq_fw_ops;
 	struct aq_nic_cfg_s aq_nic_cfg;
 	struct timer_list service_timer;
+	struct work_struct linkstate_task;
 	struct work_struct service_task;
 	struct timer_list polling_timer;
 	struct aq_hw_link_status_s link_status;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-14 10:34 [PATCH net] atlantic: fix deadlock at aq_nic_stop Íñigo Huguet
@ 2022-10-14 12:13 ` Andrew Lunn
  2022-10-14 12:43   ` Íñigo Huguet
  2022-10-20  7:53 ` [PATCH v2 " Íñigo Huguet
  1 sibling, 1 reply; 20+ messages in thread
From: Andrew Lunn @ 2022-10-14 12:13 UTC (permalink / raw)
  To: Íñigo Huguet
  Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang

> Fix trying to acquire rtnl_lock at the beginning of those functions, and
> returning if NIC closing is ongoing. Also do the "linkstate" stuff in a
> workqueue instead than in a threaded irq, where sleeping or waiting a
> mutex for a long time is discouraged.

What happens when the same interrupt fires again, while the work queue
is still active? The advantage of the threaded interrupt handler is
that the interrupt will be kept disabled, and should not fire again
until the threaded interrupt handler exits.

> +static void aq_nic_linkstate_task(struct work_struct *work)
> +{
> +	struct aq_nic_s *self = container_of(work, struct aq_nic_s,
> +					     linkstate_task);
> +
> +#if IS_ENABLED(CONFIG_MACSEC)
> +	/* avoid deadlock at aq_nic_stop -> cancel_work_sync */
> +	while (!rtnl_trylock()) {
> +		if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING))
> +			return;
> +		msleep(AQ_TASK_RETRY_MS);
> +	}
> +#endif
> +
>  	aq_nic_update_link_status(self);
>  
> +#if IS_ENABLED(CONFIG_MACSEC)
> +	rtnl_unlock();
> +#endif
> +

If MACSEC is enabled, aq_nic_update_link_status() is called with RTNL
held. If it is not enabled, RTNL is not held. This sort of
inconsistency could lead to further locking bugs, since it is not
obvious. Please try to make this consistent.

	 Andrew

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-14 12:13 ` Andrew Lunn
@ 2022-10-14 12:43   ` Íñigo Huguet
  2022-10-14 13:35     ` Andrew Lunn
  0 siblings, 1 reply; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-14 12:43 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang

On Fri, Oct 14, 2022 at 2:14 PM Andrew Lunn <andrew@lunn.ch> wrote:
>
> > Fix trying to acquire rtnl_lock at the beginning of those functions, and
> > returning if NIC closing is ongoing. Also do the "linkstate" stuff in a
> > workqueue instead than in a threaded irq, where sleeping or waiting a
> > mutex for a long time is discouraged.
>
> What happens when the same interrupt fires again, while the work queue
> is still active? The advantage of the threaded interrupt handler is
> that the interrupt will be kept disabled, and should not fire again
> until the threaded interrupt handler exits.

Nothing happens, if it's already queued, it won't be queued again, and
when it runs it will evaluate the last link state. And in the worst
case, it will be enqueued to run again, and if linkstate has changed
it will be evaluated again. This will rarely happen and it's harmless.

Also, I haven't checked it but these lines suggest that the IRQ is
auto-disabled in the hw until you enable it again. I didn't rely on
this, anyway.
        self->aq_hw_ops->hw_irq_enable(self->aq_hw,
                                       BIT(self->aq_nic_cfg.link_irq_vec));

Honestly I was a bit in doubt on doing this, with the threaded irq it
would also work. I'd like to hear more opinions about this and I can
change it back.

>
> > +static void aq_nic_linkstate_task(struct work_struct *work)
> > +{
> > +     struct aq_nic_s *self = container_of(work, struct aq_nic_s,
> > +                                          linkstate_task);
> > +
> > +#if IS_ENABLED(CONFIG_MACSEC)
> > +     /* avoid deadlock at aq_nic_stop -> cancel_work_sync */
> > +     while (!rtnl_trylock()) {
> > +             if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING))
> > +                     return;
> > +             msleep(AQ_TASK_RETRY_MS);
> > +     }
> > +#endif
> > +
> >       aq_nic_update_link_status(self);
> >
> > +#if IS_ENABLED(CONFIG_MACSEC)
> > +     rtnl_unlock();
> > +#endif
> > +
>
> If MACSEC is enabled, aq_nic_update_link_status() is called with RTNL
> held. If it is not enabled, RTNL is not held. This sort of
> inconsistency could lead to further locking bugs, since it is not
> obvious. Please try to make this consistent.

This is not new in these patches, that's what was already happening, I
just moved it to get the lock a bit earlier. In my opinion, this is as
it should be: why acquire a mutex if you don't have anything to
protect with it? And it's worse with rtnl_lock which is held by many
processes, and can be held for quite long times...

>
>          Andrew
>


-- 
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-14 12:43   ` Íñigo Huguet
@ 2022-10-14 13:35     ` Andrew Lunn
  2022-10-14 13:44       ` Íñigo Huguet
  0 siblings, 1 reply; 20+ messages in thread
From: Andrew Lunn @ 2022-10-14 13:35 UTC (permalink / raw)
  To: Íñigo Huguet
  Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang

On Fri, Oct 14, 2022 at 02:43:47PM +0200, Íñigo Huguet wrote:
> On Fri, Oct 14, 2022 at 2:14 PM Andrew Lunn <andrew@lunn.ch> wrote:
> >
> > > Fix trying to acquire rtnl_lock at the beginning of those functions, and
> > > returning if NIC closing is ongoing. Also do the "linkstate" stuff in a
> > > workqueue instead than in a threaded irq, where sleeping or waiting a
> > > mutex for a long time is discouraged.
> >
> > What happens when the same interrupt fires again, while the work queue
> > is still active? The advantage of the threaded interrupt handler is
> > that the interrupt will be kept disabled, and should not fire again
> > until the threaded interrupt handler exits.
> 
> Nothing happens, if it's already queued, it won't be queued again, and
> when it runs it will evaluate the last link state. And in the worst
> case, it will be enqueued to run again, and if linkstate has changed
> it will be evaluated again. This will rarely happen and it's harmless.
> 
> Also, I haven't checked it but these lines suggest that the IRQ is
> auto-disabled in the hw until you enable it again. I didn't rely on
> this, anyway.
>         self->aq_hw_ops->hw_irq_enable(self->aq_hw,
>                                        BIT(self->aq_nic_cfg.link_irq_vec));
> 
> Honestly I was a bit in doubt on doing this, with the threaded irq it
> would also work. I'd like to hear more opinions about this and I can
> change it back.

Ethernet PHYs do all there interrupt handling in threaded IRQs. That
can require a number of MDIO transactions. So we can be talking about
64 bits at 2.5MHz, so 25uS or more. We have not seen issues with that.

> > If MACSEC is enabled, aq_nic_update_link_status() is called with RTNL
> > held. If it is not enabled, RTNL is not held. This sort of
> > inconsistency could lead to further locking bugs, since it is not
> > obvious. Please try to make this consistent.
> 
> This is not new in these patches, that's what was already happening, I
> just moved it to get the lock a bit earlier. In my opinion, this is as
> it should be: why acquire a mutex if you don't have anything to
> protect with it? And it's worse with rtnl_lock which is held by many
> processes, and can be held for quite long times...

Maybe the lock needs to be moved closer to what actually needs to be
protect? What is it protecting?

	 Andrew

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-14 13:35     ` Andrew Lunn
@ 2022-10-14 13:44       ` Íñigo Huguet
  2022-10-15 15:09         ` Andrew Lunn
  0 siblings, 1 reply; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-14 13:44 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang

On Fri, Oct 14, 2022 at 3:35 PM Andrew Lunn <andrew@lunn.ch> wrote:
>
> On Fri, Oct 14, 2022 at 02:43:47PM +0200, Íñigo Huguet wrote:
> > On Fri, Oct 14, 2022 at 2:14 PM Andrew Lunn <andrew@lunn.ch> wrote:
> > >
> > > > Fix trying to acquire rtnl_lock at the beginning of those functions, and
> > > > returning if NIC closing is ongoing. Also do the "linkstate" stuff in a
> > > > workqueue instead than in a threaded irq, where sleeping or waiting a
> > > > mutex for a long time is discouraged.
> > >
> > > What happens when the same interrupt fires again, while the work queue
> > > is still active? The advantage of the threaded interrupt handler is
> > > that the interrupt will be kept disabled, and should not fire again
> > > until the threaded interrupt handler exits.
> >
> > Nothing happens, if it's already queued, it won't be queued again, and
> > when it runs it will evaluate the last link state. And in the worst
> > case, it will be enqueued to run again, and if linkstate has changed
> > it will be evaluated again. This will rarely happen and it's harmless.
> >
> > Also, I haven't checked it but these lines suggest that the IRQ is
> > auto-disabled in the hw until you enable it again. I didn't rely on
> > this, anyway.
> >         self->aq_hw_ops->hw_irq_enable(self->aq_hw,
> >                                        BIT(self->aq_nic_cfg.link_irq_vec));
> >
> > Honestly I was a bit in doubt on doing this, with the threaded irq it
> > would also work. I'd like to hear more opinions about this and I can
> > change it back.
>
> Ethernet PHYs do all there interrupt handling in threaded IRQs. That
> can require a number of MDIO transactions. So we can be talking about
> 64 bits at 2.5MHz, so 25uS or more. We have not seen issues with that.
>
> > > If MACSEC is enabled, aq_nic_update_link_status() is called with RTNL
> > > held. If it is not enabled, RTNL is not held. This sort of
> > > inconsistency could lead to further locking bugs, since it is not
> > > obvious. Please try to make this consistent.
> >
> > This is not new in these patches, that's what was already happening, I
> > just moved it to get the lock a bit earlier. In my opinion, this is as
> > it should be: why acquire a mutex if you don't have anything to
> > protect with it? And it's worse with rtnl_lock which is held by many
> > processes, and can be held for quite long times...
>
> Maybe the lock needs to be moved closer to what actually needs to be
> protect? What is it protecting?

It's protecting the operations of aq_macsec_enable and aq_macsec_work.
The locking was closer to them, but the idea of this patch is to move
the locking to an earlier moment so, in the case we need to abort, do
it before changing anything.

>
>          Andrew
>


-- 
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-14 13:44       ` Íñigo Huguet
@ 2022-10-15 15:09         ` Andrew Lunn
  2022-10-17  7:22           ` Íñigo Huguet
  0 siblings, 1 reply; 20+ messages in thread
From: Andrew Lunn @ 2022-10-15 15:09 UTC (permalink / raw)
  To: Íñigo Huguet
  Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang

> > Maybe the lock needs to be moved closer to what actually needs to be
> > protect? What is it protecting?
> 
> It's protecting the operations of aq_macsec_enable and aq_macsec_work.
> The locking was closer to them, but the idea of this patch is to move
> the locking to an earlier moment so, in the case we need to abort, do
> it before changing anything.

aq_check_txsa_expiration() seems to be one of the issues? At least,
the lock is taken before and released afterwards. So what in
aq_check_txsa_expiration() requires the lock?

I don't like the use of rtnl_trylock(). It suggests the basic design is
wrong, or overly complex, and so probably not working correctly.

https://blog.ffwll.ch/2022/07/locking-engineering.html

Please try to identify what is being protected. If it is driver
internal state, could it be replaced with a driver mutex, rather than
RTNL? Or is it network stack as a whole state, which really does
require RTNL? If so, how do other drivers deal with this problem? Is
it specific to MACSEC? Does MACSEC have a design problem?

   Andrew

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-15 15:09         ` Andrew Lunn
@ 2022-10-17  7:22           ` Íñigo Huguet
  2022-10-18  0:27             ` Andrew Lunn
  0 siblings, 1 reply; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-17  7:22 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang

On Sat, Oct 15, 2022 at 5:10 PM Andrew Lunn <andrew@lunn.ch> wrote:
>
> > > Maybe the lock needs to be moved closer to what actually needs to be
> > > protect? What is it protecting?
> >
> > It's protecting the operations of aq_macsec_enable and aq_macsec_work.
> > The locking was closer to them, but the idea of this patch is to move
> > the locking to an earlier moment so, in the case we need to abort, do
> > it before changing anything.
>
> aq_check_txsa_expiration() seems to be one of the issues? At least,
> the lock is taken before and released afterwards. So what in
> aq_check_txsa_expiration() requires the lock?

Basically everything in the file aq_macsec.c seems to be implicitly
protected by rtnl_lock. One group of functions are all callbacks of
the `struct macsec_ops aq_macsec_ops`, which are responsible for
configuring macsec offload, all called under rtnl_lock. The rest of
the functions in the file are called from ethtool, also protected by
rtnl_lock.

And part of the problem is that many of these operations are firmware
and/or phy configurations which I don't have documentation about how
they work. Despite this, it seems reasonable to think that they need
to be lock protected.

> I don't like the use of rtnl_trylock(). It suggests the basic design is
> wrong, or overly complex, and so probably not working correctly.
>
> https://blog.ffwll.ch/2022/07/locking-engineering.html
>
> Please try to identify what is being protected. If it is driver
> internal state, could it be replaced with a driver mutex, rather than
> RTNL? Or is it network stack as a whole state, which really does
> require RTNL? If so, how do other drivers deal with this problem? Is
> it specific to MACSEC? Does MACSEC have a design problem?

I already considered this possibility but discarded it because, as I
say above, everything else is already legitimately protected by
rtnl_lock.

The only alternative I can think of is to add a driver only mutex
(let's call it aq_macsec_mutex), as you say, and everytime that macsec
offload is to be changed both rtnl_lock and aq_macsec_mutex would be
taken. It's true that aq_macsec_mutex wouldn't be much contended
because almost always rtnl_lock needs to be acquired first. From the
workqueue and the threaded irq there wouldn't be any deadlock because
they only hold aq_macsec_mutex and ndo_stop only holds rtnl_lock. I
would also allow to put the locking close to what they protect.

I thought that this solution would be a bit overkill, but maybe it's
less overkill than the one I chose. If you're OK with this, I can
prepare an v2.


--
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-17  7:22           ` Íñigo Huguet
@ 2022-10-18  0:27             ` Andrew Lunn
  2022-10-18  2:44               ` Jakub Kicinski
  2022-10-18  6:11               ` Íñigo Huguet
  0 siblings, 2 replies; 20+ messages in thread
From: Andrew Lunn @ 2022-10-18  0:27 UTC (permalink / raw)
  To: Íñigo Huguet
  Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang

> > Please try to identify what is being protected. If it is driver
> > internal state, could it be replaced with a driver mutex, rather than
> > RTNL? Or is it network stack as a whole state, which really does
> > require RTNL? If so, how do other drivers deal with this problem? Is
> > it specific to MACSEC? Does MACSEC have a design problem?
> 
> I already considered this possibility but discarded it because, as I
> say above, everything else is already legitimately protected by
> rtnl_lock.

Did you look at other drivers using MACSEC offload? Is this driver
unique in having stuff run in a work queue which you need to cancel?
In fact, it is not limited to MACSEC, it could be any work queue which
holds RTNL and needs to be cancelled.

      Andrew


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-18  0:27             ` Andrew Lunn
@ 2022-10-18  2:44               ` Jakub Kicinski
  2022-10-18  6:15                 ` Íñigo Huguet
  2022-10-18  6:11               ` Íñigo Huguet
  1 sibling, 1 reply; 20+ messages in thread
From: Jakub Kicinski @ 2022-10-18  2:44 UTC (permalink / raw)
  To: Andrew Lunn, Íñigo Huguet
  Cc: irusskikh, dbogdanov, davem, edumazet, pabeni, netdev, Li Liang

On Tue, 18 Oct 2022 02:27:40 +0200 Andrew Lunn wrote:
> > > Please try to identify what is being protected. If it is driver
> > > internal state, could it be replaced with a driver mutex, rather than
> > > RTNL? Or is it network stack as a whole state, which really does
> > > require RTNL? If so, how do other drivers deal with this problem? Is
> > > it specific to MACSEC? Does MACSEC have a design problem?  
> > 
> > I already considered this possibility but discarded it because, as I
> > say above, everything else is already legitimately protected by
> > rtnl_lock.  
> 
> Did you look at other drivers using MACSEC offload? Is this driver
> unique in having stuff run in a work queue which you need to cancel?
> In fact, it is not limited to MACSEC, it could be any work queue which
> holds RTNL and needs to be cancelled.

FWIW the work APIs return a boolean to tell you if the work was
actually scheduled / canceled, and you can pair that with a reference
count of the netdev to avoid the typical _sync issues.

trigger()
	ASSERT_RTNL();
	if (schedule_work(netdev_priv->bla))
		netdev_hold();

work()
	rtnl_lock();
	if (netif_running())
		do_ya_thing();
	netdev_put();
	rtnl_unlock();

stop()
	ASSERT_RTNL();
	if (cancel_work(bla))
		netdev_put();

I think.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-18  0:27             ` Andrew Lunn
  2022-10-18  2:44               ` Jakub Kicinski
@ 2022-10-18  6:11               ` Íñigo Huguet
  1 sibling, 0 replies; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-18  6:11 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang

On Tue, Oct 18, 2022 at 2:28 AM Andrew Lunn <andrew@lunn.ch> wrote:
> Did you look at other drivers using MACSEC offload? Is this driver
> unique in having stuff run in a work queue which you need to cancel?
> In fact, it is not limited to MACSEC, it could be any work queue which
> holds RTNL and needs to be cancelled.

Yes, I did.

About other drivers using MACSEC offload (which are only 2): they
don't have work or anything similar related to macsec where they need
to take rtnl_lock or any other lock. But in this driver the need of a
lock seems justified, at least as far as I can understand.

About other drivers having works that need to take rtnl_lock: they
cancel it in PCI shutdown/remove functions, then call
unregister_netdev, acquiring rtnl_lock. I considered doing the same,
but I didn't for 2 reasons:
1. There is no need to have a periodic task running for a stopped NIC
2. The task uses some resources that are deinitialized at NIC stop and
try to communicate with NIC's firmware. If it's stopped in a way
different to PCI shutdown/remove (i.e. ip link set down) the task
would continue to be executed and try to use these deinitialized
resources and communicate with a stopped hw.

Of course that point 2 possibly can be fixed to avoid doing it if NIC
has been stopped, but it still remains point 1. I didn't research if
other drivers really need to have the task running periodically even
with the NIC stopped, but I certainly know that this one doesn't need
it, looking at what the task does.

I do appreciate feedback, suggestions and changes requests (actually I
happily accepted to send v2 according to them, right?). But I'd rather
if they contained more specific proposals and examples of what I can
do to improve my patches, instead of just suggesting that I should do
some research before sending them, because I already did.

Best regards
-- 
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-18  2:44               ` Jakub Kicinski
@ 2022-10-18  6:15                 ` Íñigo Huguet
  2022-10-18 15:59                   ` Jakub Kicinski
  0 siblings, 1 reply; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-18  6:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni,
	netdev, Li Liang

On Tue, Oct 18, 2022 at 4:44 AM Jakub Kicinski <kuba@kernel.org> wrote:
> FWIW the work APIs return a boolean to tell you if the work was
> actually scheduled / canceled, and you can pair that with a reference
> count of the netdev to avoid the typical _sync issues.
>
> trigger()
>         ASSERT_RTNL();
>         if (schedule_work(netdev_priv->bla))
>                 netdev_hold();
>
> work()
>         rtnl_lock();
>         if (netif_running())
>                 do_ya_thing();
>         netdev_put();
>         rtnl_unlock();
>
> stop()
>         ASSERT_RTNL();
>         if (cancel_work(bla))
>                 netdev_put();
>
> I think.
>

Interesting solution, I didn't even think of something like this.
However, despite not being 100% sure, I think that it's not valid in
this case because the work's task communicates with fw and uses
resources that are deinitialized at ndo_stop. That's why I think that
just holding a reference to the device is not enough.

-- 
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-18  6:15                 ` Íñigo Huguet
@ 2022-10-18 15:59                   ` Jakub Kicinski
  2022-10-19  6:18                     ` Íñigo Huguet
  0 siblings, 1 reply; 20+ messages in thread
From: Jakub Kicinski @ 2022-10-18 15:59 UTC (permalink / raw)
  To: Íñigo Huguet
  Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni,
	netdev, Li Liang

On Tue, 18 Oct 2022 08:15:38 +0200 Íñigo Huguet wrote:
> Interesting solution, I didn't even think of something like this.
> However, despite not being 100% sure, I think that it's not valid in
> this case because the work's task communicates with fw and uses
> resources that are deinitialized at ndo_stop. That's why I think that
> just holding a reference to the device is not enough.

You hold a reference to the netdev just to be able to take rtnl_lock()
and check if it's still running. If it is UP you're protected from it
going down due to rtnl_lock you took. If it's DOWN, as you say, it's not
safe to access all the bits so just unlock and return.

But because you're holding the reference it's safe to cancel_work()
without _sync on down, because the work itself will check if it should
have been canceled.

Dunno if that's a good explanation :S

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-18 15:59                   ` Jakub Kicinski
@ 2022-10-19  6:18                     ` Íñigo Huguet
  2022-10-19 15:39                       ` Jakub Kicinski
  0 siblings, 1 reply; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-19  6:18 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni,
	netdev, Li Liang

On Tue, Oct 18, 2022 at 5:59 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue, 18 Oct 2022 08:15:38 +0200 Íñigo Huguet wrote:
> > Interesting solution, I didn't even think of something like this.
> > However, despite not being 100% sure, I think that it's not valid in
> > this case because the work's task communicates with fw and uses
> > resources that are deinitialized at ndo_stop. That's why I think that
> > just holding a reference to the device is not enough.
>
> You hold a reference to the netdev just to be able to take rtnl_lock()
> and check if it's still running. If it is UP you're protected from it
> going down due to rtnl_lock you took. If it's DOWN, as you say, it's not
> safe to access all the bits so just unlock and return.
>
> But because you're holding the reference it's safe to cancel_work()
> without _sync on down, because the work itself will check if it should
> have been canceled.
>
> Dunno if that's a good explanation :S

Yes, now I get it.

However, I think I won't use this strategy this time: rtnl_lock is
only needed in the work task if IS_ENABLED(CONFIG_MACSEC). Acquiring
rtnl_lock every time if macsec is not enabled wouldn't be protecting
anything, so it would be a waste. I think that the strategy suggested
by Andrew of adding a dedicated mutex to protect atlantic's macsec
operations makes more sense in this case. Do you agree?

-- 
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-19  6:18                     ` Íñigo Huguet
@ 2022-10-19 15:39                       ` Jakub Kicinski
  2022-10-20  7:46                         ` Íñigo Huguet
  0 siblings, 1 reply; 20+ messages in thread
From: Jakub Kicinski @ 2022-10-19 15:39 UTC (permalink / raw)
  To: Íñigo Huguet
  Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni,
	netdev, Li Liang

On Wed, 19 Oct 2022 08:18:29 +0200 Íñigo Huguet wrote:
> Yes, now I get it.
> 
> However, I think I won't use this strategy this time: rtnl_lock is
> only needed in the work task if IS_ENABLED(CONFIG_MACSEC). Acquiring
> rtnl_lock every time if macsec is not enabled wouldn't be protecting
> anything, so it would be a waste. I think that the strategy suggested
> by Andrew of adding a dedicated mutex to protect atlantic's macsec
> operations makes more sense in this case. Do you agree?

Dunno, locks don't protect operations, they protect state (as the link
Andrew sent probably explains?), so it's hard to say how easily you can
inject a new lock into this driver covering relevant bits. My gut
feeling is that refcounting would be less error prone. But I don't feel
strongly enough to force one choice over the other.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop
  2022-10-19 15:39                       ` Jakub Kicinski
@ 2022-10-20  7:46                         ` Íñigo Huguet
  0 siblings, 0 replies; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-20  7:46 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni,
	netdev, Li Liang

On Wed, Oct 19, 2022 at 5:39 PM Jakub Kicinski <kuba@kernel.org> wrote:
> Dunno, locks don't protect operations, they protect state (as the link
> Andrew sent probably explains?),

Yes, you're completely right. I understood that well, and Andrew's
link (which is very good, thanks Andrew) explains it too. I wrongly
said "operations" because in this case the lock/unlock must be done
mainly inside the macsec_ops (operations), and thus my confusion.

I'm about to send my patch. If you still feel that it's not the best
solution, feel free to insist on the refcount or any other approach
you think is better.

Thanks and sorry for the noise
-- 
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop
  2022-10-14 10:34 [PATCH net] atlantic: fix deadlock at aq_nic_stop Íñigo Huguet
  2022-10-14 12:13 ` Andrew Lunn
@ 2022-10-20  7:53 ` Íñigo Huguet
  2022-10-20  8:55   ` Igor Russkikh
  2022-10-24  9:30   ` patchwork-bot+netdevbpf
  1 sibling, 2 replies; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-20  7:53 UTC (permalink / raw)
  To: irusskikh, kuba, andrew
  Cc: davem, edumazet, pabeni, dbogdanov, mstarovo, netdev,
	Íñigo Huguet, Li Liang

NIC is stopped with rtnl_lock held, and during the stop it cancels the
'service_task' work and free irqs.

However, if CONFIG_MACSEC is set, rtnl_lock is acquired both from
aq_nic_service_task and aq_linkstate_threaded_isr. Then a deadlock
happens if aq_nic_stop tries to cancel/disable them when they've already
started their execution.

As the deadlock is caused by rtnl_lock, it causes many other processes
to stall, not only atlantic related stuff.

Fix it by introducing a mutex that protects each NIC's macsec related
data, and locking it instead of the rtnl_lock from the service task and
the threaded IRQ.

Before this patch, all macsec data was protected with rtnl_lock, but
maybe not all of it needs to be protected. With this new mutex, further
efforts can be made to limit the protected data only to that which
requires it. However, probably it doesn't worth it because all macsec's
data accesses are infrequent, and almost all are done from macsec_ops
or ethtool callbacks, called holding rtnl_lock, so macsec_mutex won't
never be much contended.

The issue appeared repeteadly attaching and deattaching the NIC to a
bond interface. Doing that after this patch I cannot reproduce the bug.

Fixes: 62c1c2e606f6 ("net: atlantic: MACSec offload skeleton")
Reported-by: Li Liang <liali@redhat.com>
Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>

---

v2: per Andrew Lunn's suggestion, avoid rtnl_trylock approach and use a
dedicated mutex instead.
---
 .../ethernet/aquantia/atlantic/aq_macsec.c    | 96 ++++++++++++++-----
 .../net/ethernet/aquantia/atlantic/aq_nic.h   |  2 +
 2 files changed, 74 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
index 3d0e16791e1c..a0180811305d 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
@@ -1394,26 +1394,57 @@ static void aq_check_txsa_expiration(struct aq_nic_s *nic)
 			egress_sa_threshold_expired);
 }
 
+#define AQ_LOCKED_MDO_DEF(mdo)						\
+static int aq_locked_mdo_##mdo(struct macsec_context *ctx)		\
+{									\
+	struct aq_nic_s *nic = netdev_priv(ctx->netdev);		\
+	int ret;							\
+	mutex_lock(&nic->macsec_mutex);					\
+	ret = aq_mdo_##mdo(ctx);					\
+	mutex_unlock(&nic->macsec_mutex);				\
+	return ret;							\
+}
+
+AQ_LOCKED_MDO_DEF(dev_open)
+AQ_LOCKED_MDO_DEF(dev_stop)
+AQ_LOCKED_MDO_DEF(add_secy)
+AQ_LOCKED_MDO_DEF(upd_secy)
+AQ_LOCKED_MDO_DEF(del_secy)
+AQ_LOCKED_MDO_DEF(add_rxsc)
+AQ_LOCKED_MDO_DEF(upd_rxsc)
+AQ_LOCKED_MDO_DEF(del_rxsc)
+AQ_LOCKED_MDO_DEF(add_rxsa)
+AQ_LOCKED_MDO_DEF(upd_rxsa)
+AQ_LOCKED_MDO_DEF(del_rxsa)
+AQ_LOCKED_MDO_DEF(add_txsa)
+AQ_LOCKED_MDO_DEF(upd_txsa)
+AQ_LOCKED_MDO_DEF(del_txsa)
+AQ_LOCKED_MDO_DEF(get_dev_stats)
+AQ_LOCKED_MDO_DEF(get_tx_sc_stats)
+AQ_LOCKED_MDO_DEF(get_tx_sa_stats)
+AQ_LOCKED_MDO_DEF(get_rx_sc_stats)
+AQ_LOCKED_MDO_DEF(get_rx_sa_stats)
+
 const struct macsec_ops aq_macsec_ops = {
-	.mdo_dev_open = aq_mdo_dev_open,
-	.mdo_dev_stop = aq_mdo_dev_stop,
-	.mdo_add_secy = aq_mdo_add_secy,
-	.mdo_upd_secy = aq_mdo_upd_secy,
-	.mdo_del_secy = aq_mdo_del_secy,
-	.mdo_add_rxsc = aq_mdo_add_rxsc,
-	.mdo_upd_rxsc = aq_mdo_upd_rxsc,
-	.mdo_del_rxsc = aq_mdo_del_rxsc,
-	.mdo_add_rxsa = aq_mdo_add_rxsa,
-	.mdo_upd_rxsa = aq_mdo_upd_rxsa,
-	.mdo_del_rxsa = aq_mdo_del_rxsa,
-	.mdo_add_txsa = aq_mdo_add_txsa,
-	.mdo_upd_txsa = aq_mdo_upd_txsa,
-	.mdo_del_txsa = aq_mdo_del_txsa,
-	.mdo_get_dev_stats = aq_mdo_get_dev_stats,
-	.mdo_get_tx_sc_stats = aq_mdo_get_tx_sc_stats,
-	.mdo_get_tx_sa_stats = aq_mdo_get_tx_sa_stats,
-	.mdo_get_rx_sc_stats = aq_mdo_get_rx_sc_stats,
-	.mdo_get_rx_sa_stats = aq_mdo_get_rx_sa_stats,
+	.mdo_dev_open = aq_locked_mdo_dev_open,
+	.mdo_dev_stop = aq_locked_mdo_dev_stop,
+	.mdo_add_secy = aq_locked_mdo_add_secy,
+	.mdo_upd_secy = aq_locked_mdo_upd_secy,
+	.mdo_del_secy = aq_locked_mdo_del_secy,
+	.mdo_add_rxsc = aq_locked_mdo_add_rxsc,
+	.mdo_upd_rxsc = aq_locked_mdo_upd_rxsc,
+	.mdo_del_rxsc = aq_locked_mdo_del_rxsc,
+	.mdo_add_rxsa = aq_locked_mdo_add_rxsa,
+	.mdo_upd_rxsa = aq_locked_mdo_upd_rxsa,
+	.mdo_del_rxsa = aq_locked_mdo_del_rxsa,
+	.mdo_add_txsa = aq_locked_mdo_add_txsa,
+	.mdo_upd_txsa = aq_locked_mdo_upd_txsa,
+	.mdo_del_txsa = aq_locked_mdo_del_txsa,
+	.mdo_get_dev_stats = aq_locked_mdo_get_dev_stats,
+	.mdo_get_tx_sc_stats = aq_locked_mdo_get_tx_sc_stats,
+	.mdo_get_tx_sa_stats = aq_locked_mdo_get_tx_sa_stats,
+	.mdo_get_rx_sc_stats = aq_locked_mdo_get_rx_sc_stats,
+	.mdo_get_rx_sa_stats = aq_locked_mdo_get_rx_sa_stats,
 };
 
 int aq_macsec_init(struct aq_nic_s *nic)
@@ -1435,6 +1466,7 @@ int aq_macsec_init(struct aq_nic_s *nic)
 
 	nic->ndev->features |= NETIF_F_HW_MACSEC;
 	nic->ndev->macsec_ops = &aq_macsec_ops;
+	mutex_init(&nic->macsec_mutex);
 
 	return 0;
 }
@@ -1458,7 +1490,7 @@ int aq_macsec_enable(struct aq_nic_s *nic)
 	if (!nic->macsec_cfg)
 		return 0;
 
-	rtnl_lock();
+	mutex_lock(&nic->macsec_mutex);
 
 	if (nic->aq_fw_ops->send_macsec_req) {
 		struct macsec_cfg_request cfg = { 0 };
@@ -1507,7 +1539,7 @@ int aq_macsec_enable(struct aq_nic_s *nic)
 	ret = aq_apply_macsec_cfg(nic);
 
 unlock:
-	rtnl_unlock();
+	mutex_unlock(&nic->macsec_mutex);
 	return ret;
 }
 
@@ -1519,9 +1551,9 @@ void aq_macsec_work(struct aq_nic_s *nic)
 	if (!netif_carrier_ok(nic->ndev))
 		return;
 
-	rtnl_lock();
+	mutex_lock(&nic->macsec_mutex);
 	aq_check_txsa_expiration(nic);
-	rtnl_unlock();
+	mutex_unlock(&nic->macsec_mutex);
 }
 
 int aq_macsec_rx_sa_cnt(struct aq_nic_s *nic)
@@ -1532,21 +1564,30 @@ int aq_macsec_rx_sa_cnt(struct aq_nic_s *nic)
 	if (!cfg)
 		return 0;
 
+	mutex_lock(&nic->macsec_mutex);
+
 	for (i = 0; i < AQ_MACSEC_MAX_SC; i++) {
 		if (!test_bit(i, &cfg->rxsc_idx_busy))
 			continue;
 		cnt += hweight_long(cfg->aq_rxsc[i].rx_sa_idx_busy);
 	}
 
+	mutex_unlock(&nic->macsec_mutex);
 	return cnt;
 }
 
 int aq_macsec_tx_sc_cnt(struct aq_nic_s *nic)
 {
+	int cnt;
+
 	if (!nic->macsec_cfg)
 		return 0;
 
-	return hweight_long(nic->macsec_cfg->txsc_idx_busy);
+	mutex_lock(&nic->macsec_mutex);
+	cnt = hweight_long(nic->macsec_cfg->txsc_idx_busy);
+	mutex_unlock(&nic->macsec_mutex);
+
+	return cnt;
 }
 
 int aq_macsec_tx_sa_cnt(struct aq_nic_s *nic)
@@ -1557,12 +1598,15 @@ int aq_macsec_tx_sa_cnt(struct aq_nic_s *nic)
 	if (!cfg)
 		return 0;
 
+	mutex_lock(&nic->macsec_mutex);
+
 	for (i = 0; i < AQ_MACSEC_MAX_SC; i++) {
 		if (!test_bit(i, &cfg->txsc_idx_busy))
 			continue;
 		cnt += hweight_long(cfg->aq_txsc[i].tx_sa_idx_busy);
 	}
 
+	mutex_unlock(&nic->macsec_mutex);
 	return cnt;
 }
 
@@ -1634,6 +1678,8 @@ u64 *aq_macsec_get_stats(struct aq_nic_s *nic, u64 *data)
 	if (!cfg)
 		return data;
 
+	mutex_lock(&nic->macsec_mutex);
+
 	aq_macsec_update_stats(nic);
 
 	common_stats = &cfg->stats;
@@ -1716,5 +1762,7 @@ u64 *aq_macsec_get_stats(struct aq_nic_s *nic, u64 *data)
 
 	data += i;
 
+	mutex_unlock(&nic->macsec_mutex);
+
 	return data;
 }
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index 935ba889bd9a..ad33f8586532 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -157,6 +157,8 @@ struct aq_nic_s {
 	struct mutex fwreq_mutex;
 #if IS_ENABLED(CONFIG_MACSEC)
 	struct aq_macsec_cfg *macsec_cfg;
+	/* mutex to protect data in macsec_cfg */
+	struct mutex macsec_mutex;
 #endif
 	/* PTP support */
 	struct aq_ptp_s *aq_ptp;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop
  2022-10-20  7:53 ` [PATCH v2 " Íñigo Huguet
@ 2022-10-20  8:55   ` Igor Russkikh
  2022-10-20 16:17     ` Íñigo Huguet
  2022-10-21 19:01     ` Íñigo Huguet
  2022-10-24  9:30   ` patchwork-bot+netdevbpf
  1 sibling, 2 replies; 20+ messages in thread
From: Igor Russkikh @ 2022-10-20  8:55 UTC (permalink / raw)
  To: Íñigo Huguet, kuba, andrew
  Cc: davem, edumazet, pabeni, mstarovo, netdev, Li Liang



On 10/20/2022 9:53 AM, Íñigo Huguet wrote:
> NIC is stopped with rtnl_lock held, and during the stop it cancels the
> 'service_task' work and free irqs.

Hi Íñigo, thanks for taking care of this.

Just reviewed, overall looks reasonable for me. Unfortunately I don't recall
now why RTNL lock was used originally, most probably we've tried to secure
parallel "ip macsec configure something" commands execution.

But the model with internal mutex looks safe for me.

Unfortunately I now have no ability to verify your patch, edge usecase here
would be to try stress running in parallel:
"ethtool -S <iface>"
"ip macsec show"
"ip macsec <change something>"
Plus ideal would be link flipping.

Have you tried something like that?

Reviewed-by: Igor Russkikh <irusskikh@marvell.com>

Regards,
  Igor

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop
  2022-10-20  8:55   ` Igor Russkikh
@ 2022-10-20 16:17     ` Íñigo Huguet
  2022-10-21 19:01     ` Íñigo Huguet
  1 sibling, 0 replies; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-20 16:17 UTC (permalink / raw)
  To: Igor Russkikh
  Cc: kuba, andrew, davem, edumazet, pabeni, mstarovo, netdev, Li Liang

On Thu, Oct 20, 2022 at 10:56 AM Igor Russkikh <irusskikh@marvell.com> wrote:
>
>
>
> On 10/20/2022 9:53 AM, Íñigo Huguet wrote:
> > NIC is stopped with rtnl_lock held, and during the stop it cancels the
> > 'service_task' work and free irqs.
>
> Hi Íñigo, thanks for taking care of this.
>
> Just reviewed, overall looks reasonable for me. Unfortunately I don't recall
> now why RTNL lock was used originally, most probably we've tried to secure
> parallel "ip macsec configure something" commands execution.
>
> But the model with internal mutex looks safe for me.
>
> Unfortunately I now have no ability to verify your patch, edge usecase here
> would be to try stress running in parallel:
> "ethtool -S <iface>"
> "ip macsec show"
> "ip macsec <change something>"
> Plus ideal would be link flipping.
>
> Have you tried something like that?

I will try to run that tomorrow, thanks!

>
> Reviewed-by: Igor Russkikh <irusskikh@marvell.com>
>
> Regards,
>   Igor
>


-- 
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop
  2022-10-20  8:55   ` Igor Russkikh
  2022-10-20 16:17     ` Íñigo Huguet
@ 2022-10-21 19:01     ` Íñigo Huguet
  1 sibling, 0 replies; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-21 19:01 UTC (permalink / raw)
  To: Igor Russkikh
  Cc: kuba, andrew, davem, edumazet, pabeni, mstarovo, netdev, Li Liang

On Thu, Oct 20, 2022 at 10:56 AM Igor Russkikh <irusskikh@marvell.com> wrote:
>
>
>
> On 10/20/2022 9:53 AM, Íñigo Huguet wrote:
> > NIC is stopped with rtnl_lock held, and during the stop it cancels the
> > 'service_task' work and free irqs.
>
> Hi Íñigo, thanks for taking care of this.
>
> Just reviewed, overall looks reasonable for me. Unfortunately I don't recall
> now why RTNL lock was used originally, most probably we've tried to secure
> parallel "ip macsec configure something" commands execution.
>
> But the model with internal mutex looks safe for me.
>
> Unfortunately I now have no ability to verify your patch, edge usecase here
> would be to try stress running in parallel:
> "ethtool -S <iface>"
> "ip macsec show"
> "ip macsec <change something>"
> Plus ideal would be link flipping.

I've been running all this in parallel for some hours without issue.


>
> Have you tried something like that?
>
> Reviewed-by: Igor Russkikh <irusskikh@marvell.com>
>
> Regards,
>   Igor
>


--
Íñigo Huguet


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop
  2022-10-20  7:53 ` [PATCH v2 " Íñigo Huguet
  2022-10-20  8:55   ` Igor Russkikh
@ 2022-10-24  9:30   ` patchwork-bot+netdevbpf
  1 sibling, 0 replies; 20+ messages in thread
From: patchwork-bot+netdevbpf @ 2022-10-24  9:30 UTC (permalink / raw)
  To: =?utf-8?b?w43DsWlnbyBIdWd1ZXQgPGlodWd1ZXRAcmVkaGF0LmNvbT4=?=
  Cc: irusskikh, kuba, andrew, davem, edumazet, pabeni, dbogdanov,
	mstarovo, netdev, liali

Hello:

This patch was applied to netdev/net.git (master)
by David S. Miller <davem@davemloft.net>:

On Thu, 20 Oct 2022 09:53:10 +0200 you wrote:
> NIC is stopped with rtnl_lock held, and during the stop it cancels the
> 'service_task' work and free irqs.
> 
> However, if CONFIG_MACSEC is set, rtnl_lock is acquired both from
> aq_nic_service_task and aq_linkstate_threaded_isr. Then a deadlock
> happens if aq_nic_stop tries to cancel/disable them when they've already
> started their execution.
> 
> [...]

Here is the summary with links:
  - [v2,net] atlantic: fix deadlock at aq_nic_stop
    https://git.kernel.org/netdev/net/c/6960d133f66e

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-10-24  9:30 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-14 10:34 [PATCH net] atlantic: fix deadlock at aq_nic_stop Íñigo Huguet
2022-10-14 12:13 ` Andrew Lunn
2022-10-14 12:43   ` Íñigo Huguet
2022-10-14 13:35     ` Andrew Lunn
2022-10-14 13:44       ` Íñigo Huguet
2022-10-15 15:09         ` Andrew Lunn
2022-10-17  7:22           ` Íñigo Huguet
2022-10-18  0:27             ` Andrew Lunn
2022-10-18  2:44               ` Jakub Kicinski
2022-10-18  6:15                 ` Íñigo Huguet
2022-10-18 15:59                   ` Jakub Kicinski
2022-10-19  6:18                     ` Íñigo Huguet
2022-10-19 15:39                       ` Jakub Kicinski
2022-10-20  7:46                         ` Íñigo Huguet
2022-10-18  6:11               ` Íñigo Huguet
2022-10-20  7:53 ` [PATCH v2 " Íñigo Huguet
2022-10-20  8:55   ` Igor Russkikh
2022-10-20 16:17     ` Íñigo Huguet
2022-10-21 19:01     ` Íñigo Huguet
2022-10-24  9:30   ` patchwork-bot+netdevbpf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.