All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ath10k: move irq setup
@ 2013-07-18  6:39 ` Michal Kazior
  0 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-07-18  6:39 UTC (permalink / raw)
  To: ath10k; +Cc: linux-wireless, Michal Kazior

There was a slight race during PCI shutdown. Since
interrupts weren't really stopped (only Copy
Engine interrupts were disabled through device hw
registers) it was possible for a firmware
indication (crash) interrupt to come in after
tasklets were synced/killed. This would cause
memory corruption and a panic in most cases. It
was also possible for interrupt to come before CE
was initialized during device probing.

Interrupts are required for BMI phase so they are
enabled as soon as power_up() is called but are
freed upon both power_down() and stop() so there's
asymmetry here. As by design stop() cannot be
followed by start() it is okay. Both power_down()
and stop() should be merged later on to avoid
confusion.

Before this can be really properly fixed var/hw
init code split is necessary.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---

Please note: this is based on my (still under
review at the time of posting) previous patchests:
device setup refactor and recovery.

I'm posting this before those patchsets are merged
so anyone interested in testing this fix (I can't
reproduce the problem on my setup) can give it a
try.

 drivers/net/wireless/ath/ath10k/pci.c |   36 +++++++++++++++++++++++----------
 drivers/net/wireless/ath/ath10k/pci.h |    1 +
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index c71b488..e814151 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -56,6 +56,8 @@ static void ath10k_pci_rx_pipe_cleanup(struct hif_ce_pipe_info *pipe_info);
 static void ath10k_pci_stop_ce(struct ath10k *ar);
 static void ath10k_pci_device_reset(struct ath10k *ar);
 static int ath10k_pci_reset_target(struct ath10k *ar);
+static int ath10k_pci_start_intr(struct ath10k *ar);
+static void ath10k_pci_stop_intr(struct ath10k *ar);
 
 static const struct ce_attr host_ce_config_wlan[] = {
 	/* host->target HTC control and raw streams */
@@ -827,6 +829,7 @@ static void ath10k_pci_stop_ce(struct ath10k *ar)
 	int i;
 
 	ath10k_ce_disable_interrupts(ar);
+	ath10k_pci_stop_intr(ar);
 
 	/* Cancel the pending tasklet */
 	tasklet_kill(&ar_pci->intr_tq);
@@ -1742,6 +1745,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 {
 	int ret;
 
+	ret = ath10k_pci_start_intr(ar);
+	if (ret) {
+		ath10k_err("could not start interrupt handling (%d)\n", ret);
+		goto err;
+	}
+
 	/*
 	 * Bring the target up cleanly.
 	 *
@@ -1756,7 +1765,7 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 
 	ret = ath10k_pci_reset_target(ar);
 	if (ret)
-		goto err;
+		goto err_intr;
 
 	if (ath10k_target_ps) {
 		ath10k_dbg(ATH10K_DBG_PCI, "on-chip power save enabled\n");
@@ -1783,16 +1792,24 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 	return 0;
 
 err_ce:
+	/* XXX: Until var/hw init is split it's impossible to fix the ordering
+	 * here so we must call stop_intr() here too to prevent interrupts after
+	 * CE is teared down. It's okay to double call the stop_intr() */
+	ath10k_pci_stop_intr(ar);
 	ath10k_pci_ce_deinit(ar);
 err_ps:
 	if (!ath10k_target_ps)
 		ath10k_do_pci_sleep(ar);
+err_intr:
+	ath10k_pci_stop_intr(ar);
 err:
 	return ret;
 }
 
 static void ath10k_pci_hif_power_down(struct ath10k *ar)
 {
+	ath10k_pci_stop_intr(ar);
+
 	ath10k_pci_ce_deinit(ar);
 	if (!ath10k_target_ps)
 		ath10k_do_pci_sleep(ar);
@@ -2119,6 +2136,7 @@ static int ath10k_pci_start_intr(struct ath10k *ar)
 	ret = ath10k_pci_start_intr_legacy(ar);
 
 exit:
+	ar_pci->intr_started = ret == 0;
 	ar_pci->num_msi_intrs = num;
 	ar_pci->ce_count = CE_COUNT;
 	return ret;
@@ -2129,6 +2147,9 @@ static void ath10k_pci_stop_intr(struct ath10k *ar)
 	struct ath10k_pci *ar_pci = ath10k_pci_priv(ar);
 	int i;
 
+	if (ar_pci->intr_started == false)
+		return;
+
 	/* There's at least one interrupt irregardless whether its legacy INTR
 	 * or MSI or MSI-X */
 	for (i = 0; i < max(1, ar_pci->num_msi_intrs); i++)
@@ -2136,6 +2157,8 @@ static void ath10k_pci_stop_intr(struct ath10k *ar)
 
 	if (ar_pci->num_msi_intrs > 0)
 		pci_disable_msi(ar_pci->pdev);
+
+	ar_pci->intr_started = false;
 }
 
 static int ath10k_pci_reset_target(struct ath10k *ar)
@@ -2358,22 +2381,14 @@ static int ath10k_pci_probe(struct pci_dev *pdev,
 
 	ar_pci->cacheline_sz = dma_get_cache_alignment();
 
-	ret = ath10k_pci_start_intr(ar);
-	if (ret) {
-		ath10k_err("could not start interrupt handling (%d)\n", ret);
-		goto err_iomap;
-	}
-
 	ret = ath10k_core_register(ar);
 	if (ret) {
 		ath10k_err("could not register driver core (%d)\n", ret);
-		goto err_intr;
+		goto err_iomap;
 	}
 
 	return 0;
 
-err_intr:
-	ath10k_pci_stop_intr(ar);
 err_iomap:
 	pci_iounmap(pdev, mem);
 err_master:
@@ -2410,7 +2425,6 @@ static void ath10k_pci_remove(struct pci_dev *pdev)
 	tasklet_kill(&ar_pci->msi_fw_err);
 
 	ath10k_core_unregister(ar);
-	ath10k_pci_stop_intr(ar);
 
 	pci_set_drvdata(pdev, NULL);
 	pci_iounmap(pdev, ar_pci->mem);
diff --git a/drivers/net/wireless/ath/ath10k/pci.h b/drivers/net/wireless/ath/ath10k/pci.h
index d3a2e6c..5eb628f 100644
--- a/drivers/net/wireless/ath/ath10k/pci.h
+++ b/drivers/net/wireless/ath/ath10k/pci.h
@@ -198,6 +198,7 @@ struct ath10k_pci {
 	 * interrupts.
 	 */
 	int num_msi_intrs;
+	bool intr_started;
 
 	struct tasklet_struct intr_tq;
 	struct tasklet_struct msi_fw_err;
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH] ath10k: move irq setup
@ 2013-07-18  6:39 ` Michal Kazior
  0 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-07-18  6:39 UTC (permalink / raw)
  To: ath10k; +Cc: linux-wireless, Michal Kazior

There was a slight race during PCI shutdown. Since
interrupts weren't really stopped (only Copy
Engine interrupts were disabled through device hw
registers) it was possible for a firmware
indication (crash) interrupt to come in after
tasklets were synced/killed. This would cause
memory corruption and a panic in most cases. It
was also possible for interrupt to come before CE
was initialized during device probing.

Interrupts are required for BMI phase so they are
enabled as soon as power_up() is called but are
freed upon both power_down() and stop() so there's
asymmetry here. As by design stop() cannot be
followed by start() it is okay. Both power_down()
and stop() should be merged later on to avoid
confusion.

Before this can be really properly fixed var/hw
init code split is necessary.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---

Please note: this is based on my (still under
review at the time of posting) previous patchests:
device setup refactor and recovery.

I'm posting this before those patchsets are merged
so anyone interested in testing this fix (I can't
reproduce the problem on my setup) can give it a
try.

 drivers/net/wireless/ath/ath10k/pci.c |   36 +++++++++++++++++++++++----------
 drivers/net/wireless/ath/ath10k/pci.h |    1 +
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index c71b488..e814151 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -56,6 +56,8 @@ static void ath10k_pci_rx_pipe_cleanup(struct hif_ce_pipe_info *pipe_info);
 static void ath10k_pci_stop_ce(struct ath10k *ar);
 static void ath10k_pci_device_reset(struct ath10k *ar);
 static int ath10k_pci_reset_target(struct ath10k *ar);
+static int ath10k_pci_start_intr(struct ath10k *ar);
+static void ath10k_pci_stop_intr(struct ath10k *ar);
 
 static const struct ce_attr host_ce_config_wlan[] = {
 	/* host->target HTC control and raw streams */
@@ -827,6 +829,7 @@ static void ath10k_pci_stop_ce(struct ath10k *ar)
 	int i;
 
 	ath10k_ce_disable_interrupts(ar);
+	ath10k_pci_stop_intr(ar);
 
 	/* Cancel the pending tasklet */
 	tasklet_kill(&ar_pci->intr_tq);
@@ -1742,6 +1745,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 {
 	int ret;
 
+	ret = ath10k_pci_start_intr(ar);
+	if (ret) {
+		ath10k_err("could not start interrupt handling (%d)\n", ret);
+		goto err;
+	}
+
 	/*
 	 * Bring the target up cleanly.
 	 *
@@ -1756,7 +1765,7 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 
 	ret = ath10k_pci_reset_target(ar);
 	if (ret)
-		goto err;
+		goto err_intr;
 
 	if (ath10k_target_ps) {
 		ath10k_dbg(ATH10K_DBG_PCI, "on-chip power save enabled\n");
@@ -1783,16 +1792,24 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 	return 0;
 
 err_ce:
+	/* XXX: Until var/hw init is split it's impossible to fix the ordering
+	 * here so we must call stop_intr() here too to prevent interrupts after
+	 * CE is teared down. It's okay to double call the stop_intr() */
+	ath10k_pci_stop_intr(ar);
 	ath10k_pci_ce_deinit(ar);
 err_ps:
 	if (!ath10k_target_ps)
 		ath10k_do_pci_sleep(ar);
+err_intr:
+	ath10k_pci_stop_intr(ar);
 err:
 	return ret;
 }
 
 static void ath10k_pci_hif_power_down(struct ath10k *ar)
 {
+	ath10k_pci_stop_intr(ar);
+
 	ath10k_pci_ce_deinit(ar);
 	if (!ath10k_target_ps)
 		ath10k_do_pci_sleep(ar);
@@ -2119,6 +2136,7 @@ static int ath10k_pci_start_intr(struct ath10k *ar)
 	ret = ath10k_pci_start_intr_legacy(ar);
 
 exit:
+	ar_pci->intr_started = ret == 0;
 	ar_pci->num_msi_intrs = num;
 	ar_pci->ce_count = CE_COUNT;
 	return ret;
@@ -2129,6 +2147,9 @@ static void ath10k_pci_stop_intr(struct ath10k *ar)
 	struct ath10k_pci *ar_pci = ath10k_pci_priv(ar);
 	int i;
 
+	if (ar_pci->intr_started == false)
+		return;
+
 	/* There's at least one interrupt irregardless whether its legacy INTR
 	 * or MSI or MSI-X */
 	for (i = 0; i < max(1, ar_pci->num_msi_intrs); i++)
@@ -2136,6 +2157,8 @@ static void ath10k_pci_stop_intr(struct ath10k *ar)
 
 	if (ar_pci->num_msi_intrs > 0)
 		pci_disable_msi(ar_pci->pdev);
+
+	ar_pci->intr_started = false;
 }
 
 static int ath10k_pci_reset_target(struct ath10k *ar)
@@ -2358,22 +2381,14 @@ static int ath10k_pci_probe(struct pci_dev *pdev,
 
 	ar_pci->cacheline_sz = dma_get_cache_alignment();
 
-	ret = ath10k_pci_start_intr(ar);
-	if (ret) {
-		ath10k_err("could not start interrupt handling (%d)\n", ret);
-		goto err_iomap;
-	}
-
 	ret = ath10k_core_register(ar);
 	if (ret) {
 		ath10k_err("could not register driver core (%d)\n", ret);
-		goto err_intr;
+		goto err_iomap;
 	}
 
 	return 0;
 
-err_intr:
-	ath10k_pci_stop_intr(ar);
 err_iomap:
 	pci_iounmap(pdev, mem);
 err_master:
@@ -2410,7 +2425,6 @@ static void ath10k_pci_remove(struct pci_dev *pdev)
 	tasklet_kill(&ar_pci->msi_fw_err);
 
 	ath10k_core_unregister(ar);
-	ath10k_pci_stop_intr(ar);
 
 	pci_set_drvdata(pdev, NULL);
 	pci_iounmap(pdev, ar_pci->mem);
diff --git a/drivers/net/wireless/ath/ath10k/pci.h b/drivers/net/wireless/ath/ath10k/pci.h
index d3a2e6c..5eb628f 100644
--- a/drivers/net/wireless/ath/ath10k/pci.h
+++ b/drivers/net/wireless/ath/ath10k/pci.h
@@ -198,6 +198,7 @@ struct ath10k_pci {
 	 * interrupts.
 	 */
 	int num_msi_intrs;
+	bool intr_started;
 
 	struct tasklet_struct intr_tq;
 	struct tasklet_struct msi_fw_err;
-- 
1.7.9.5


_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] ath10k: move irq setup
  2013-07-18  6:39 ` Michal Kazior
@ 2013-07-30 18:35   ` Kalle Valo
  -1 siblings, 0 replies; 18+ messages in thread
From: Kalle Valo @ 2013-07-30 18:35 UTC (permalink / raw)
  To: Michal Kazior; +Cc: ath10k, linux-wireless

Michal Kazior <michal.kazior@tieto.com> writes:

> There was a slight race during PCI shutdown. Since
> interrupts weren't really stopped (only Copy
> Engine interrupts were disabled through device hw
> registers) it was possible for a firmware
> indication (crash) interrupt to come in after
> tasklets were synced/killed. This would cause
> memory corruption and a panic in most cases. It
> was also possible for interrupt to come before CE
> was initialized during device probing.
>
> Interrupts are required for BMI phase so they are enabled as soon as
> power_up() is called but are freed upon both power_down() and stop()
> so there's asymmetry here. As by design stop() cannot be followed by
> start() it is okay. Both power_down() and stop() should be merged
> later on to avoid confusion.

Why are the interrupts freed both in power_down() and stop()? I don't
get that.

What if we call disable_irq() in power_down() instead?

> Before this can be really properly fixed var/hw
> init code split is necessary.
>
> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
> ---
>
> Please note: this is based on my (still under
> review at the time of posting) previous patchests:
> device setup refactor and recovery.
>
> I'm posting this before those patchsets are merged
> so anyone interested in testing this fix (I can't
> reproduce the problem on my setup) can give it a
> try.

This was reported by Ben, right? So this sould have a Reported-by line
attributing him.

> @@ -1783,16 +1792,24 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>  	return 0;
>  
>  err_ce:
> +	/* XXX: Until var/hw init is split it's impossible to fix the ordering
> +	 * here so we must call stop_intr() here too to prevent interrupts after
> +	 * CE is teared down. It's okay to double call the stop_intr()
> */

"FIXME:"

>  exit:
> +	ar_pci->intr_started = ret == 0;

A bit too clever for the sake of readibility for my taste, but I guess
it's ok.

> --- a/drivers/net/wireless/ath/ath10k/pci.h
> +++ b/drivers/net/wireless/ath/ath10k/pci.h
> @@ -198,6 +198,7 @@ struct ath10k_pci {
>  	 * interrupts.
>  	 */
>  	int num_msi_intrs;
> +	bool intr_started;

Adding a new state variable makes me worried. I really would prefer a
solution which would not require that.

Also if we call request_irq() in ath10k_pci_probe() we should also call
free_irq() in ath10k_pci_remove() for symmetry. Just doing a temporary
hack will most likely stay forever :)

-- 
Kalle Valo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ath10k: move irq setup
@ 2013-07-30 18:35   ` Kalle Valo
  0 siblings, 0 replies; 18+ messages in thread
From: Kalle Valo @ 2013-07-30 18:35 UTC (permalink / raw)
  To: Michal Kazior; +Cc: linux-wireless, ath10k

Michal Kazior <michal.kazior@tieto.com> writes:

> There was a slight race during PCI shutdown. Since
> interrupts weren't really stopped (only Copy
> Engine interrupts were disabled through device hw
> registers) it was possible for a firmware
> indication (crash) interrupt to come in after
> tasklets were synced/killed. This would cause
> memory corruption and a panic in most cases. It
> was also possible for interrupt to come before CE
> was initialized during device probing.
>
> Interrupts are required for BMI phase so they are enabled as soon as
> power_up() is called but are freed upon both power_down() and stop()
> so there's asymmetry here. As by design stop() cannot be followed by
> start() it is okay. Both power_down() and stop() should be merged
> later on to avoid confusion.

Why are the interrupts freed both in power_down() and stop()? I don't
get that.

What if we call disable_irq() in power_down() instead?

> Before this can be really properly fixed var/hw
> init code split is necessary.
>
> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
> ---
>
> Please note: this is based on my (still under
> review at the time of posting) previous patchests:
> device setup refactor and recovery.
>
> I'm posting this before those patchsets are merged
> so anyone interested in testing this fix (I can't
> reproduce the problem on my setup) can give it a
> try.

This was reported by Ben, right? So this sould have a Reported-by line
attributing him.

> @@ -1783,16 +1792,24 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>  	return 0;
>  
>  err_ce:
> +	/* XXX: Until var/hw init is split it's impossible to fix the ordering
> +	 * here so we must call stop_intr() here too to prevent interrupts after
> +	 * CE is teared down. It's okay to double call the stop_intr()
> */

"FIXME:"

>  exit:
> +	ar_pci->intr_started = ret == 0;

A bit too clever for the sake of readibility for my taste, but I guess
it's ok.

> --- a/drivers/net/wireless/ath/ath10k/pci.h
> +++ b/drivers/net/wireless/ath/ath10k/pci.h
> @@ -198,6 +198,7 @@ struct ath10k_pci {
>  	 * interrupts.
>  	 */
>  	int num_msi_intrs;
> +	bool intr_started;

Adding a new state variable makes me worried. I really would prefer a
solution which would not require that.

Also if we call request_irq() in ath10k_pci_probe() we should also call
free_irq() in ath10k_pci_remove() for symmetry. Just doing a temporary
hack will most likely stay forever :)

-- 
Kalle Valo

_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ath10k: move irq setup
  2013-07-30 18:35   ` Kalle Valo
@ 2013-07-31  5:50     ` Michal Kazior
  -1 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-07-31  5:50 UTC (permalink / raw)
  To: Kalle Valo; +Cc: ath10k, linux-wireless

On 30 July 2013 20:35, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
> Michal Kazior <michal.kazior@tieto.com> writes:
>
>> There was a slight race during PCI shutdown. Since
>> interrupts weren't really stopped (only Copy
>> Engine interrupts were disabled through device hw
>> registers) it was possible for a firmware
>> indication (crash) interrupt to come in after
>> tasklets were synced/killed. This would cause
>> memory corruption and a panic in most cases. It
>> was also possible for interrupt to come before CE
>> was initialized during device probing.
>>
>> Interrupts are required for BMI phase so they are enabled as soon as
>> power_up() is called but are freed upon both power_down() and stop()
>> so there's asymmetry here. As by design stop() cannot be followed by
>> start() it is okay. Both power_down() and stop() should be merged
>> later on to avoid confusion.
>
> Why are the interrupts freed both in power_down() and stop()? I don't
> get that.
>
> What if we call disable_irq() in power_down() instead?

power_down() must call free_irq(), because power_up() calls
request_irq() (if you want the symmetry). If anything, the stop()
should call disable_irq(), but wouldn't that mean start() should call
enable_irq()? But than, irqs are needed before start()..

I did think about disable_irq() but AFAIR you need to enable_irq()
later on (so either way you need to keep track of the irq state or
you'll get a ton of WARN_ONs from the system). I'll double check that
and report back later


>> Before this can be really properly fixed var/hw
>> init code split is necessary.
>>
>> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
>> ---
>>
>> Please note: this is based on my (still under
>> review at the time of posting) previous patchests:
>> device setup refactor and recovery.
>>
>> I'm posting this before those patchsets are merged
>> so anyone interested in testing this fix (I can't
>> reproduce the problem on my setup) can give it a
>> try.
>
> This was reported by Ben, right? So this sould have a Reported-by line
> attributing him.

Yes. I'll fix that, provided we get through the review with the patch :)


>> @@ -1783,16 +1792,24 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>>       return 0;
>>
>>  err_ce:
>> +     /* XXX: Until var/hw init is split it's impossible to fix the ordering
>> +      * here so we must call stop_intr() here too to prevent interrupts after
>> +      * CE is teared down. It's okay to double call the stop_intr()
>> */
>
> "FIXME:"

Ok.



>>  exit:
>> +     ar_pci->intr_started = ret == 0;
>
> A bit too clever for the sake of readibility for my taste, but I guess
> it's ok.
>
>> --- a/drivers/net/wireless/ath/ath10k/pci.h
>> +++ b/drivers/net/wireless/ath/ath10k/pci.h
>> @@ -198,6 +198,7 @@ struct ath10k_pci {
>>        * interrupts.
>>        */
>>       int num_msi_intrs;
>> +     bool intr_started;
>
> Adding a new state variable makes me worried. I really would prefer a
> solution which would not require that.

I know that. That's why I mentioned in the commit log that it is more
of a workaround than a real fix. Me, I don't like this either but a
real fix requires a lot of rework from what I can tell.

This bug can be triggered more easily now apparently after recovery
patches went in. I'm not experiencing it but I get reports of rare
panics when a machine is left idle for a very long time with
interfaces brought down.


> Also if we call request_irq() in ath10k_pci_probe() we should also call
> free_irq() in ath10k_pci_remove() for symmetry. Just doing a temporary
> hack will most likely stay forever :)

With the patch interrupts are temporarily enabled&disabled for
probe_fw() during pci_probe() and are then not requested until
mac80211 start().


Pozdrawiam / Best regards,
Michał Kazior.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ath10k: move irq setup
@ 2013-07-31  5:50     ` Michal Kazior
  0 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-07-31  5:50 UTC (permalink / raw)
  To: Kalle Valo; +Cc: linux-wireless, ath10k

On 30 July 2013 20:35, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
> Michal Kazior <michal.kazior@tieto.com> writes:
>
>> There was a slight race during PCI shutdown. Since
>> interrupts weren't really stopped (only Copy
>> Engine interrupts were disabled through device hw
>> registers) it was possible for a firmware
>> indication (crash) interrupt to come in after
>> tasklets were synced/killed. This would cause
>> memory corruption and a panic in most cases. It
>> was also possible for interrupt to come before CE
>> was initialized during device probing.
>>
>> Interrupts are required for BMI phase so they are enabled as soon as
>> power_up() is called but are freed upon both power_down() and stop()
>> so there's asymmetry here. As by design stop() cannot be followed by
>> start() it is okay. Both power_down() and stop() should be merged
>> later on to avoid confusion.
>
> Why are the interrupts freed both in power_down() and stop()? I don't
> get that.
>
> What if we call disable_irq() in power_down() instead?

power_down() must call free_irq(), because power_up() calls
request_irq() (if you want the symmetry). If anything, the stop()
should call disable_irq(), but wouldn't that mean start() should call
enable_irq()? But than, irqs are needed before start()..

I did think about disable_irq() but AFAIR you need to enable_irq()
later on (so either way you need to keep track of the irq state or
you'll get a ton of WARN_ONs from the system). I'll double check that
and report back later


>> Before this can be really properly fixed var/hw
>> init code split is necessary.
>>
>> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
>> ---
>>
>> Please note: this is based on my (still under
>> review at the time of posting) previous patchests:
>> device setup refactor and recovery.
>>
>> I'm posting this before those patchsets are merged
>> so anyone interested in testing this fix (I can't
>> reproduce the problem on my setup) can give it a
>> try.
>
> This was reported by Ben, right? So this sould have a Reported-by line
> attributing him.

Yes. I'll fix that, provided we get through the review with the patch :)


>> @@ -1783,16 +1792,24 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>>       return 0;
>>
>>  err_ce:
>> +     /* XXX: Until var/hw init is split it's impossible to fix the ordering
>> +      * here so we must call stop_intr() here too to prevent interrupts after
>> +      * CE is teared down. It's okay to double call the stop_intr()
>> */
>
> "FIXME:"

Ok.



>>  exit:
>> +     ar_pci->intr_started = ret == 0;
>
> A bit too clever for the sake of readibility for my taste, but I guess
> it's ok.
>
>> --- a/drivers/net/wireless/ath/ath10k/pci.h
>> +++ b/drivers/net/wireless/ath/ath10k/pci.h
>> @@ -198,6 +198,7 @@ struct ath10k_pci {
>>        * interrupts.
>>        */
>>       int num_msi_intrs;
>> +     bool intr_started;
>
> Adding a new state variable makes me worried. I really would prefer a
> solution which would not require that.

I know that. That's why I mentioned in the commit log that it is more
of a workaround than a real fix. Me, I don't like this either but a
real fix requires a lot of rework from what I can tell.

This bug can be triggered more easily now apparently after recovery
patches went in. I'm not experiencing it but I get reports of rare
panics when a machine is left idle for a very long time with
interfaces brought down.


> Also if we call request_irq() in ath10k_pci_probe() we should also call
> free_irq() in ath10k_pci_remove() for symmetry. Just doing a temporary
> hack will most likely stay forever :)

With the patch interrupts are temporarily enabled&disabled for
probe_fw() during pci_probe() and are then not requested until
mac80211 start().


Pozdrawiam / Best regards,
Michał Kazior.

_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ath10k: move irq setup
  2013-07-31  5:50     ` Michal Kazior
@ 2013-07-31 10:50       ` Michal Kazior
  -1 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-07-31 10:50 UTC (permalink / raw)
  To: Kalle Valo; +Cc: ath10k, linux-wireless

On 31 July 2013 07:50, Michal Kazior <michal.kazior@tieto.com> wrote:
> On 30 July 2013 20:35, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
>> Michal Kazior <michal.kazior@tieto.com> writes:
>>
>>> There was a slight race during PCI shutdown. Since
>>> interrupts weren't really stopped (only Copy
>>> Engine interrupts were disabled through device hw
>>> registers) it was possible for a firmware
>>> indication (crash) interrupt to come in after
>>> tasklets were synced/killed. This would cause
>>> memory corruption and a panic in most cases. It
>>> was also possible for interrupt to come before CE
>>> was initialized during device probing.
>>>
>>> Interrupts are required for BMI phase so they are enabled as soon as
>>> power_up() is called but are freed upon both power_down() and stop()
>>> so there's asymmetry here. As by design stop() cannot be followed by
>>> start() it is okay. Both power_down() and stop() should be merged
>>> later on to avoid confusion.
>>
>> Why are the interrupts freed both in power_down() and stop()? I don't
>> get that.
>>
>> What if we call disable_irq() in power_down() instead?
>
> power_down() must call free_irq(), because power_up() calls
> request_irq() (if you want the symmetry). If anything, the stop()
> should call disable_irq(), but wouldn't that mean start() should call
> enable_irq()? But than, irqs are needed before start()..
>
> I did think about disable_irq() but AFAIR you need to enable_irq()
> later on (so either way you need to keep track of the irq state or
> you'll get a ton of WARN_ONs from the system). I'll double check that
> and report back later

enable/disable_irq must be balanced as well.

There are two cases of power cycle:
 * power_up, power_down
 * power_up, start, stop, power_down

If irq setup is moved from pci_probe/remove to power_up/power_down,
then stop() still needs irqs to be halted - either disable_irq, or
free_irq. In the latter case power_down must be prepared and not issue
free_irq again.

If irq setup remains in pci_probe/remove then both stop() and
power_down() need irqs to be halted too. Same issue applies.

If stop/power_down is merged than the whole problem is solved. This
seems like the sane solution to the whole problem but requires some
refactoring to be done first.


Pozdrawiam / Best regards,
Michał Kazior.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ath10k: move irq setup
@ 2013-07-31 10:50       ` Michal Kazior
  0 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-07-31 10:50 UTC (permalink / raw)
  To: Kalle Valo; +Cc: linux-wireless, ath10k

On 31 July 2013 07:50, Michal Kazior <michal.kazior@tieto.com> wrote:
> On 30 July 2013 20:35, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
>> Michal Kazior <michal.kazior@tieto.com> writes:
>>
>>> There was a slight race during PCI shutdown. Since
>>> interrupts weren't really stopped (only Copy
>>> Engine interrupts were disabled through device hw
>>> registers) it was possible for a firmware
>>> indication (crash) interrupt to come in after
>>> tasklets were synced/killed. This would cause
>>> memory corruption and a panic in most cases. It
>>> was also possible for interrupt to come before CE
>>> was initialized during device probing.
>>>
>>> Interrupts are required for BMI phase so they are enabled as soon as
>>> power_up() is called but are freed upon both power_down() and stop()
>>> so there's asymmetry here. As by design stop() cannot be followed by
>>> start() it is okay. Both power_down() and stop() should be merged
>>> later on to avoid confusion.
>>
>> Why are the interrupts freed both in power_down() and stop()? I don't
>> get that.
>>
>> What if we call disable_irq() in power_down() instead?
>
> power_down() must call free_irq(), because power_up() calls
> request_irq() (if you want the symmetry). If anything, the stop()
> should call disable_irq(), but wouldn't that mean start() should call
> enable_irq()? But than, irqs are needed before start()..
>
> I did think about disable_irq() but AFAIR you need to enable_irq()
> later on (so either way you need to keep track of the irq state or
> you'll get a ton of WARN_ONs from the system). I'll double check that
> and report back later

enable/disable_irq must be balanced as well.

There are two cases of power cycle:
 * power_up, power_down
 * power_up, start, stop, power_down

If irq setup is moved from pci_probe/remove to power_up/power_down,
then stop() still needs irqs to be halted - either disable_irq, or
free_irq. In the latter case power_down must be prepared and not issue
free_irq again.

If irq setup remains in pci_probe/remove then both stop() and
power_down() need irqs to be halted too. Same issue applies.

If stop/power_down is merged than the whole problem is solved. This
seems like the sane solution to the whole problem but requires some
refactoring to be done first.


Pozdrawiam / Best regards,
Michał Kazior.

_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v2] ath10k: fix device teardown
  2013-07-18  6:39 ` Michal Kazior
@ 2013-08-02  7:15   ` Michal Kazior
  -1 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-08-02  7:15 UTC (permalink / raw)
  To: ath10k; +Cc: linux-wireless, Michal Kazior

This fixes interrupt-related issue when no
interfaces were running thus the device was
considered powered down.

The power_down() function isn't really powering
down the device. It simply assumed it won't
interrupt. This wasn't true in some cases and
could lead to paging failures upon FW indication
interrupt (i.e. FW crash) because some structures
aren't allocated in that device state.

One reason for that was that ar_pci->started
wasn't reset. The other is interrupts should've
been masked when teardown starts.

The patch reorganized interrupt setup and makes
sure ar_pci->started is reset accordingly.

Reported-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---
v2:
 * updated commit message
 * added Reported-By: Ben
 * added disable_irq() in hif_stop()
 * added ar_pci->started resetting
 * removed ar_pci->intr_started

 drivers/net/wireless/ath/ath10k/pci.c |   41 ++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index c71b488..690a8b4 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -56,6 +56,8 @@ static void ath10k_pci_rx_pipe_cleanup(struct hif_ce_pipe_info *pipe_info);
 static void ath10k_pci_stop_ce(struct ath10k *ar);
 static void ath10k_pci_device_reset(struct ath10k *ar);
 static int ath10k_pci_reset_target(struct ath10k *ar);
+static int ath10k_pci_start_intr(struct ath10k *ar);
+static void ath10k_pci_stop_intr(struct ath10k *ar);
 
 static const struct ce_attr host_ce_config_wlan[] = {
 	/* host->target HTC control and raw streams */
@@ -1254,10 +1256,25 @@ static void ath10k_pci_ce_deinit(struct ath10k *ar)
 	}
 }
 
+static void ath10k_pci_disable_irqs(struct ath10k *ar)
+{
+	struct ath10k_pci *ar_pci = ath10k_pci_priv(ar);
+	int i;
+
+	for (i = 0; i < max(1, ar_pci->num_msi_intrs); i++)
+		disable_irq(ar_pci->pdev->irq + i);
+}
+
 static void ath10k_pci_hif_stop(struct ath10k *ar)
 {
+	struct ath10k_pci *ar_pci = ath10k_pci_priv(ar);
+
 	ath10k_dbg(ATH10K_DBG_PCI, "%s\n", __func__);
 
+	/* Irqs are never explicitly re-enabled. They are implicitly re-enabled
+	 * by ath10k_pci_start_intr(). */
+	ath10k_pci_disable_irqs(ar);
+
 	ath10k_pci_stop_ce(ar);
 
 	/* At this point, asynchronous threads are stopped, the target should
@@ -1267,6 +1284,8 @@ static void ath10k_pci_hif_stop(struct ath10k *ar)
 	ath10k_pci_process_ce(ar);
 	ath10k_pci_cleanup_ce(ar);
 	ath10k_pci_buffer_cleanup(ar);
+
+	ar_pci->started = 0;
 }
 
 static int ath10k_pci_hif_exchange_bmi_msg(struct ath10k *ar,
@@ -1742,6 +1761,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 {
 	int ret;
 
+	ret = ath10k_pci_start_intr(ar);
+	if (ret) {
+		ath10k_err("could not start interrupt handling (%d)\n", ret);
+		goto err;
+	}
+
 	/*
 	 * Bring the target up cleanly.
 	 *
@@ -1756,7 +1781,7 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 
 	ret = ath10k_pci_reset_target(ar);
 	if (ret)
-		goto err;
+		goto err_irq;
 
 	if (ath10k_target_ps) {
 		ath10k_dbg(ATH10K_DBG_PCI, "on-chip power save enabled\n");
@@ -1787,12 +1812,15 @@ err_ce:
 err_ps:
 	if (!ath10k_target_ps)
 		ath10k_do_pci_sleep(ar);
+err_irq:
+	ath10k_pci_stop_intr(ar);
 err:
 	return ret;
 }
 
 static void ath10k_pci_hif_power_down(struct ath10k *ar)
 {
+	ath10k_pci_stop_intr(ar);
 	ath10k_pci_ce_deinit(ar);
 	if (!ath10k_target_ps)
 		ath10k_do_pci_sleep(ar);
@@ -2358,22 +2386,14 @@ static int ath10k_pci_probe(struct pci_dev *pdev,
 
 	ar_pci->cacheline_sz = dma_get_cache_alignment();
 
-	ret = ath10k_pci_start_intr(ar);
-	if (ret) {
-		ath10k_err("could not start interrupt handling (%d)\n", ret);
-		goto err_iomap;
-	}
-
 	ret = ath10k_core_register(ar);
 	if (ret) {
 		ath10k_err("could not register driver core (%d)\n", ret);
-		goto err_intr;
+		goto err_iomap;
 	}
 
 	return 0;
 
-err_intr:
-	ath10k_pci_stop_intr(ar);
 err_iomap:
 	pci_iounmap(pdev, mem);
 err_master:
@@ -2410,7 +2430,6 @@ static void ath10k_pci_remove(struct pci_dev *pdev)
 	tasklet_kill(&ar_pci->msi_fw_err);
 
 	ath10k_core_unregister(ar);
-	ath10k_pci_stop_intr(ar);
 
 	pci_set_drvdata(pdev, NULL);
 	pci_iounmap(pdev, ar_pci->mem);
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2] ath10k: fix device teardown
@ 2013-08-02  7:15   ` Michal Kazior
  0 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-08-02  7:15 UTC (permalink / raw)
  To: ath10k; +Cc: linux-wireless, Michal Kazior

This fixes interrupt-related issue when no
interfaces were running thus the device was
considered powered down.

The power_down() function isn't really powering
down the device. It simply assumed it won't
interrupt. This wasn't true in some cases and
could lead to paging failures upon FW indication
interrupt (i.e. FW crash) because some structures
aren't allocated in that device state.

One reason for that was that ar_pci->started
wasn't reset. The other is interrupts should've
been masked when teardown starts.

The patch reorganized interrupt setup and makes
sure ar_pci->started is reset accordingly.

Reported-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
---
v2:
 * updated commit message
 * added Reported-By: Ben
 * added disable_irq() in hif_stop()
 * added ar_pci->started resetting
 * removed ar_pci->intr_started

 drivers/net/wireless/ath/ath10k/pci.c |   41 ++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index c71b488..690a8b4 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -56,6 +56,8 @@ static void ath10k_pci_rx_pipe_cleanup(struct hif_ce_pipe_info *pipe_info);
 static void ath10k_pci_stop_ce(struct ath10k *ar);
 static void ath10k_pci_device_reset(struct ath10k *ar);
 static int ath10k_pci_reset_target(struct ath10k *ar);
+static int ath10k_pci_start_intr(struct ath10k *ar);
+static void ath10k_pci_stop_intr(struct ath10k *ar);
 
 static const struct ce_attr host_ce_config_wlan[] = {
 	/* host->target HTC control and raw streams */
@@ -1254,10 +1256,25 @@ static void ath10k_pci_ce_deinit(struct ath10k *ar)
 	}
 }
 
+static void ath10k_pci_disable_irqs(struct ath10k *ar)
+{
+	struct ath10k_pci *ar_pci = ath10k_pci_priv(ar);
+	int i;
+
+	for (i = 0; i < max(1, ar_pci->num_msi_intrs); i++)
+		disable_irq(ar_pci->pdev->irq + i);
+}
+
 static void ath10k_pci_hif_stop(struct ath10k *ar)
 {
+	struct ath10k_pci *ar_pci = ath10k_pci_priv(ar);
+
 	ath10k_dbg(ATH10K_DBG_PCI, "%s\n", __func__);
 
+	/* Irqs are never explicitly re-enabled. They are implicitly re-enabled
+	 * by ath10k_pci_start_intr(). */
+	ath10k_pci_disable_irqs(ar);
+
 	ath10k_pci_stop_ce(ar);
 
 	/* At this point, asynchronous threads are stopped, the target should
@@ -1267,6 +1284,8 @@ static void ath10k_pci_hif_stop(struct ath10k *ar)
 	ath10k_pci_process_ce(ar);
 	ath10k_pci_cleanup_ce(ar);
 	ath10k_pci_buffer_cleanup(ar);
+
+	ar_pci->started = 0;
 }
 
 static int ath10k_pci_hif_exchange_bmi_msg(struct ath10k *ar,
@@ -1742,6 +1761,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 {
 	int ret;
 
+	ret = ath10k_pci_start_intr(ar);
+	if (ret) {
+		ath10k_err("could not start interrupt handling (%d)\n", ret);
+		goto err;
+	}
+
 	/*
 	 * Bring the target up cleanly.
 	 *
@@ -1756,7 +1781,7 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
 
 	ret = ath10k_pci_reset_target(ar);
 	if (ret)
-		goto err;
+		goto err_irq;
 
 	if (ath10k_target_ps) {
 		ath10k_dbg(ATH10K_DBG_PCI, "on-chip power save enabled\n");
@@ -1787,12 +1812,15 @@ err_ce:
 err_ps:
 	if (!ath10k_target_ps)
 		ath10k_do_pci_sleep(ar);
+err_irq:
+	ath10k_pci_stop_intr(ar);
 err:
 	return ret;
 }
 
 static void ath10k_pci_hif_power_down(struct ath10k *ar)
 {
+	ath10k_pci_stop_intr(ar);
 	ath10k_pci_ce_deinit(ar);
 	if (!ath10k_target_ps)
 		ath10k_do_pci_sleep(ar);
@@ -2358,22 +2386,14 @@ static int ath10k_pci_probe(struct pci_dev *pdev,
 
 	ar_pci->cacheline_sz = dma_get_cache_alignment();
 
-	ret = ath10k_pci_start_intr(ar);
-	if (ret) {
-		ath10k_err("could not start interrupt handling (%d)\n", ret);
-		goto err_iomap;
-	}
-
 	ret = ath10k_core_register(ar);
 	if (ret) {
 		ath10k_err("could not register driver core (%d)\n", ret);
-		goto err_intr;
+		goto err_iomap;
 	}
 
 	return 0;
 
-err_intr:
-	ath10k_pci_stop_intr(ar);
 err_iomap:
 	pci_iounmap(pdev, mem);
 err_master:
@@ -2410,7 +2430,6 @@ static void ath10k_pci_remove(struct pci_dev *pdev)
 	tasklet_kill(&ar_pci->msi_fw_err);
 
 	ath10k_core_unregister(ar);
-	ath10k_pci_stop_intr(ar);
 
 	pci_set_drvdata(pdev, NULL);
 	pci_iounmap(pdev, ar_pci->mem);
-- 
1.7.9.5


_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ath10k: fix device teardown
  2013-08-02  7:15   ` Michal Kazior
@ 2013-08-02  7:41     ` Kalle Valo
  -1 siblings, 0 replies; 18+ messages in thread
From: Kalle Valo @ 2013-08-02  7:41 UTC (permalink / raw)
  To: Michal Kazior; +Cc: ath10k, linux-wireless

Michal Kazior <michal.kazior@tieto.com> writes:

> This fixes interrupt-related issue when no
> interfaces were running thus the device was
> considered powered down.
>
> The power_down() function isn't really powering
> down the device. It simply assumed it won't
> interrupt. This wasn't true in some cases and
> could lead to paging failures upon FW indication
> interrupt (i.e. FW crash) because some structures
> aren't allocated in that device state.
>
> One reason for that was that ar_pci->started
> wasn't reset. The other is interrupts should've
> been masked when teardown starts.
>
> The patch reorganized interrupt setup and makes
> sure ar_pci->started is reset accordingly.
>
> Reported-by: Ben Greear <greearb@candelatech.com>
> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
> ---
> v2:
>  * updated commit message
>  * added Reported-By: Ben
>  * added disable_irq() in hif_stop()
>  * added ar_pci->started resetting
>  * removed ar_pci->intr_started

Thanks, this looks much better now.

But I still have one question:

> @@ -1742,6 +1761,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>  {
>  	int ret;
>  
> +	ret = ath10k_pci_start_intr(ar);
> +	if (ret) {
> +		ath10k_err("could not start interrupt handling (%d)\n", ret);
> +		goto err;
> +	}

So now we call start_intr() during power_up(), which means that we do
the request_irq() calls during every interface up event. Does that cause
any meaningful overhead?

For me it looks better to do all resource allocation in
ath10k_pci_probe(), like request_irq(), and free the resources in
ath10k_pci_remove(). But then we would need to immeadiately call
disable_irq() and then enable_irq() from power_up() so I'm not sure if
that's any better.

-- 
Kalle Valo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ath10k: fix device teardown
@ 2013-08-02  7:41     ` Kalle Valo
  0 siblings, 0 replies; 18+ messages in thread
From: Kalle Valo @ 2013-08-02  7:41 UTC (permalink / raw)
  To: Michal Kazior; +Cc: linux-wireless, ath10k

Michal Kazior <michal.kazior@tieto.com> writes:

> This fixes interrupt-related issue when no
> interfaces were running thus the device was
> considered powered down.
>
> The power_down() function isn't really powering
> down the device. It simply assumed it won't
> interrupt. This wasn't true in some cases and
> could lead to paging failures upon FW indication
> interrupt (i.e. FW crash) because some structures
> aren't allocated in that device state.
>
> One reason for that was that ar_pci->started
> wasn't reset. The other is interrupts should've
> been masked when teardown starts.
>
> The patch reorganized interrupt setup and makes
> sure ar_pci->started is reset accordingly.
>
> Reported-by: Ben Greear <greearb@candelatech.com>
> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
> ---
> v2:
>  * updated commit message
>  * added Reported-By: Ben
>  * added disable_irq() in hif_stop()
>  * added ar_pci->started resetting
>  * removed ar_pci->intr_started

Thanks, this looks much better now.

But I still have one question:

> @@ -1742,6 +1761,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>  {
>  	int ret;
>  
> +	ret = ath10k_pci_start_intr(ar);
> +	if (ret) {
> +		ath10k_err("could not start interrupt handling (%d)\n", ret);
> +		goto err;
> +	}

So now we call start_intr() during power_up(), which means that we do
the request_irq() calls during every interface up event. Does that cause
any meaningful overhead?

For me it looks better to do all resource allocation in
ath10k_pci_probe(), like request_irq(), and free the resources in
ath10k_pci_remove(). But then we would need to immeadiately call
disable_irq() and then enable_irq() from power_up() so I'm not sure if
that's any better.

-- 
Kalle Valo

_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ath10k: fix device teardown
  2013-08-02  7:41     ` Kalle Valo
@ 2013-08-02  7:51       ` Michal Kazior
  -1 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-08-02  7:51 UTC (permalink / raw)
  To: Kalle Valo; +Cc: ath10k, linux-wireless

On 2 August 2013 09:41, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
> Michal Kazior <michal.kazior@tieto.com> writes:
>
>> This fixes interrupt-related issue when no
>> interfaces were running thus the device was
>> considered powered down.
>>
>> The power_down() function isn't really powering
>> down the device. It simply assumed it won't
>> interrupt. This wasn't true in some cases and
>> could lead to paging failures upon FW indication
>> interrupt (i.e. FW crash) because some structures
>> aren't allocated in that device state.
>>
>> One reason for that was that ar_pci->started
>> wasn't reset. The other is interrupts should've
>> been masked when teardown starts.
>>
>> The patch reorganized interrupt setup and makes
>> sure ar_pci->started is reset accordingly.
>>
>> Reported-by: Ben Greear <greearb@candelatech.com>
>> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
>> ---
>> v2:
>>  * updated commit message
>>  * added Reported-By: Ben
>>  * added disable_irq() in hif_stop()
>>  * added ar_pci->started resetting
>>  * removed ar_pci->intr_started
>
> Thanks, this looks much better now.
>
> But I still have one question:
>
>> @@ -1742,6 +1761,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>>  {
>>       int ret;
>>
>> +     ret = ath10k_pci_start_intr(ar);
>> +     if (ret) {
>> +             ath10k_err("could not start interrupt handling (%d)\n", ret);
>> +             goto err;
>> +     }
>
> So now we call start_intr() during power_up(), which means that we do
> the request_irq() calls during every interface up event. Does that cause
> any meaningful overhead?

I don't think so.


> For me it looks better to do all resource allocation in
> ath10k_pci_probe(), like request_irq(), and free the resources in
> ath10k_pci_remove(). But then we would need to immeadiately call
> disable_irq() and then enable_irq() from power_up() so I'm not sure if
> that's any better.

Not only that. Since disable/enable_irq must be balanced we'd need
some way to track whether we have irqs enabled/disabled - either with
an extra bool variable, additional ath10k_states or new pci-specific
states.

The patch assumes disable_irq is followed by free_irq (which it is)
and possibly request_irq later on.


Pozdrawiam / Best regards,
Michał Kazior.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ath10k: fix device teardown
@ 2013-08-02  7:51       ` Michal Kazior
  0 siblings, 0 replies; 18+ messages in thread
From: Michal Kazior @ 2013-08-02  7:51 UTC (permalink / raw)
  To: Kalle Valo; +Cc: linux-wireless, ath10k

On 2 August 2013 09:41, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
> Michal Kazior <michal.kazior@tieto.com> writes:
>
>> This fixes interrupt-related issue when no
>> interfaces were running thus the device was
>> considered powered down.
>>
>> The power_down() function isn't really powering
>> down the device. It simply assumed it won't
>> interrupt. This wasn't true in some cases and
>> could lead to paging failures upon FW indication
>> interrupt (i.e. FW crash) because some structures
>> aren't allocated in that device state.
>>
>> One reason for that was that ar_pci->started
>> wasn't reset. The other is interrupts should've
>> been masked when teardown starts.
>>
>> The patch reorganized interrupt setup and makes
>> sure ar_pci->started is reset accordingly.
>>
>> Reported-by: Ben Greear <greearb@candelatech.com>
>> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
>> ---
>> v2:
>>  * updated commit message
>>  * added Reported-By: Ben
>>  * added disable_irq() in hif_stop()
>>  * added ar_pci->started resetting
>>  * removed ar_pci->intr_started
>
> Thanks, this looks much better now.
>
> But I still have one question:
>
>> @@ -1742,6 +1761,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>>  {
>>       int ret;
>>
>> +     ret = ath10k_pci_start_intr(ar);
>> +     if (ret) {
>> +             ath10k_err("could not start interrupt handling (%d)\n", ret);
>> +             goto err;
>> +     }
>
> So now we call start_intr() during power_up(), which means that we do
> the request_irq() calls during every interface up event. Does that cause
> any meaningful overhead?

I don't think so.


> For me it looks better to do all resource allocation in
> ath10k_pci_probe(), like request_irq(), and free the resources in
> ath10k_pci_remove(). But then we would need to immeadiately call
> disable_irq() and then enable_irq() from power_up() so I'm not sure if
> that's any better.

Not only that. Since disable/enable_irq must be balanced we'd need
some way to track whether we have irqs enabled/disabled - either with
an extra bool variable, additional ath10k_states or new pci-specific
states.

The patch assumes disable_irq is followed by free_irq (which it is)
and possibly request_irq later on.


Pozdrawiam / Best regards,
Michał Kazior.

_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ath10k: fix device teardown
  2013-08-02  7:51       ` Michal Kazior
@ 2013-08-02  8:00         ` Kalle Valo
  -1 siblings, 0 replies; 18+ messages in thread
From: Kalle Valo @ 2013-08-02  8:00 UTC (permalink / raw)
  To: Michal Kazior; +Cc: ath10k, linux-wireless

Michal Kazior <michal.kazior@tieto.com> writes:

> On 2 August 2013 09:41, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
>> Michal Kazior <michal.kazior@tieto.com> writes:
>>
>>> @@ -1742,6 +1761,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>>>  {
>>>       int ret;
>>>
>>> +     ret = ath10k_pci_start_intr(ar);
>>> +     if (ret) {
>>> +             ath10k_err("could not start interrupt handling (%d)\n", ret);
>>> +             goto err;
>>> +     }
>>
>> So now we call start_intr() during power_up(), which means that we do
>> the request_irq() calls during every interface up event. Does that cause
>> any meaningful overhead?
>
> I don't think so.

Good.

>> For me it looks better to do all resource allocation in
>> ath10k_pci_probe(), like request_irq(), and free the resources in
>> ath10k_pci_remove(). But then we would need to immeadiately call
>> disable_irq() and then enable_irq() from power_up() so I'm not sure if
>> that's any better.
>
> Not only that. Since disable/enable_irq must be balanced we'd need
> some way to track whether we have irqs enabled/disabled - either with
> an extra bool variable, additional ath10k_states or new pci-specific
> states.
>
> The patch assumes disable_irq is followed by free_irq (which it is)
> and possibly request_irq later on.

Yeah, your v2 sounds much better. And if there's overhead or something
else we can always change this later.

I'll wait for comments from others and if I don't get any, I'll apply
this.

-- 
Kalle Valo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ath10k: fix device teardown
@ 2013-08-02  8:00         ` Kalle Valo
  0 siblings, 0 replies; 18+ messages in thread
From: Kalle Valo @ 2013-08-02  8:00 UTC (permalink / raw)
  To: Michal Kazior; +Cc: linux-wireless, ath10k

Michal Kazior <michal.kazior@tieto.com> writes:

> On 2 August 2013 09:41, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
>> Michal Kazior <michal.kazior@tieto.com> writes:
>>
>>> @@ -1742,6 +1761,12 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar)
>>>  {
>>>       int ret;
>>>
>>> +     ret = ath10k_pci_start_intr(ar);
>>> +     if (ret) {
>>> +             ath10k_err("could not start interrupt handling (%d)\n", ret);
>>> +             goto err;
>>> +     }
>>
>> So now we call start_intr() during power_up(), which means that we do
>> the request_irq() calls during every interface up event. Does that cause
>> any meaningful overhead?
>
> I don't think so.

Good.

>> For me it looks better to do all resource allocation in
>> ath10k_pci_probe(), like request_irq(), and free the resources in
>> ath10k_pci_remove(). But then we would need to immeadiately call
>> disable_irq() and then enable_irq() from power_up() so I'm not sure if
>> that's any better.
>
> Not only that. Since disable/enable_irq must be balanced we'd need
> some way to track whether we have irqs enabled/disabled - either with
> an extra bool variable, additional ath10k_states or new pci-specific
> states.
>
> The patch assumes disable_irq is followed by free_irq (which it is)
> and possibly request_irq later on.

Yeah, your v2 sounds much better. And if there's overhead or something
else we can always change this later.

I'll wait for comments from others and if I don't get any, I'll apply
this.

-- 
Kalle Valo

_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ath10k: fix device teardown
  2013-08-02  7:15   ` Michal Kazior
@ 2013-08-05 16:23     ` Kalle Valo
  -1 siblings, 0 replies; 18+ messages in thread
From: Kalle Valo @ 2013-08-05 16:23 UTC (permalink / raw)
  To: Michal Kazior; +Cc: ath10k, linux-wireless

Michal Kazior <michal.kazior@tieto.com> writes:

> This fixes interrupt-related issue when no
> interfaces were running thus the device was
> considered powered down.
>
> The power_down() function isn't really powering
> down the device. It simply assumed it won't
> interrupt. This wasn't true in some cases and
> could lead to paging failures upon FW indication
> interrupt (i.e. FW crash) because some structures
> aren't allocated in that device state.
>
> One reason for that was that ar_pci->started
> wasn't reset. The other is interrupts should've
> been masked when teardown starts.
>
> The patch reorganized interrupt setup and makes
> sure ar_pci->started is reset accordingly.
>
> Reported-by: Ben Greear <greearb@candelatech.com>
> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>

Thanks, applied.

-- 
Kalle Valo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2] ath10k: fix device teardown
@ 2013-08-05 16:23     ` Kalle Valo
  0 siblings, 0 replies; 18+ messages in thread
From: Kalle Valo @ 2013-08-05 16:23 UTC (permalink / raw)
  To: Michal Kazior; +Cc: linux-wireless, ath10k

Michal Kazior <michal.kazior@tieto.com> writes:

> This fixes interrupt-related issue when no
> interfaces were running thus the device was
> considered powered down.
>
> The power_down() function isn't really powering
> down the device. It simply assumed it won't
> interrupt. This wasn't true in some cases and
> could lead to paging failures upon FW indication
> interrupt (i.e. FW crash) because some structures
> aren't allocated in that device state.
>
> One reason for that was that ar_pci->started
> wasn't reset. The other is interrupts should've
> been masked when teardown starts.
>
> The patch reorganized interrupt setup and makes
> sure ar_pci->started is reset accordingly.
>
> Reported-by: Ben Greear <greearb@candelatech.com>
> Signed-off-by: Michal Kazior <michal.kazior@tieto.com>

Thanks, applied.

-- 
Kalle Valo

_______________________________________________
ath10k mailing list
ath10k@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/ath10k

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2013-08-05 16:24 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-18  6:39 [PATCH] ath10k: move irq setup Michal Kazior
2013-07-18  6:39 ` Michal Kazior
2013-07-30 18:35 ` Kalle Valo
2013-07-30 18:35   ` Kalle Valo
2013-07-31  5:50   ` Michal Kazior
2013-07-31  5:50     ` Michal Kazior
2013-07-31 10:50     ` Michal Kazior
2013-07-31 10:50       ` Michal Kazior
2013-08-02  7:15 ` [PATCH v2] ath10k: fix device teardown Michal Kazior
2013-08-02  7:15   ` Michal Kazior
2013-08-02  7:41   ` Kalle Valo
2013-08-02  7:41     ` Kalle Valo
2013-08-02  7:51     ` Michal Kazior
2013-08-02  7:51       ` Michal Kazior
2013-08-02  8:00       ` Kalle Valo
2013-08-02  8:00         ` Kalle Valo
2013-08-05 16:23   ` Kalle Valo
2013-08-05 16:23     ` Kalle Valo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.