linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
@ 2015-09-16 16:53 Matthew R. Ochs
  2015-09-17 12:38 ` Tomas Henzl
  0 siblings, 1 reply; 10+ messages in thread
From: Matthew R. Ochs @ 2015-09-16 16:53 UTC (permalink / raw)
  To: linux-scsi, James.Bottomley, nab, brking, imunsie, dja, andrew.donnellan
  Cc: mikey, linuxppc-dev, Manoj N. Kumar

Interrupt processing can run in parallel to a remove operation. This
can lead to a condition where the interrupt handler is processing with
memory that has been freed.
    
To avoid processing an interrupt while memory may be yanked, check for
removal while in the interrupt handler. Bail when removal is imminent.

Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
---
 drivers/scsi/cxlflash/common.h |  2 ++
 drivers/scsi/cxlflash/main.c   | 21 +++++++++++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
index 1abe4e0..03d2cc6 100644
--- a/drivers/scsi/cxlflash/common.h
+++ b/drivers/scsi/cxlflash/common.h
@@ -103,6 +103,8 @@ struct cxlflash_cfg {
 	enum cxlflash_lr_state lr_state;
 	int lr_port;
 
+	atomic_t remove_active;
+
 	struct cxl_afu *cxl_afu;
 
 	struct pci_pool *cxlflash_cmd_pool;
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index 6e85c77..89ee648 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -892,6 +892,7 @@ static void cxlflash_remove(struct pci_dev *pdev)
 	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
 
 	cfg->state = STATE_FAILTERM;
+	atomic_inc(&cfg->remove_active);
 	cxlflash_stop_term_user_contexts(cfg);
 
 	switch (cfg->init_state) {
@@ -1380,16 +1381,20 @@ static void afu_err_intr_init(struct afu *afu)
 static irqreturn_t cxlflash_sync_err_irq(int irq, void *data)
 {
 	struct afu *afu = (struct afu *)data;
+	struct cxlflash_cfg *cfg = afu->parent;
 	u64 reg;
 	u64 reg_unmasked;
 
+	if (atomic_read(&cfg->remove_active))
+		goto out;
+
 	reg = readq_be(&afu->host_map->intr_status);
 	reg_unmasked = (reg & SISL_ISTATUS_UNMASK);
 
 	if (reg_unmasked == 0UL) {
 		pr_err("%s: %llX: spurious interrupt, intr_status %016llX\n",
 		       __func__, (u64)afu, reg);
-		goto cxlflash_sync_err_irq_exit;
+		goto out;
 	}
 
 	pr_err("%s: %llX: unexpected interrupt, intr_status %016llX\n",
@@ -1397,7 +1402,7 @@ static irqreturn_t cxlflash_sync_err_irq(int irq, void *data)
 
 	writeq_be(reg_unmasked, &afu->host_map->intr_clear);
 
-cxlflash_sync_err_irq_exit:
+out:
 	pr_debug("%s: returning rc=%d\n", __func__, IRQ_HANDLED);
 	return IRQ_HANDLED;
 }
@@ -1412,6 +1417,7 @@ cxlflash_sync_err_irq_exit:
 static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
 {
 	struct afu *afu = (struct afu *)data;
+	struct cxlflash_cfg *cfg = afu->parent;
 	struct afu_cmd *cmd;
 	bool toggle = afu->toggle;
 	u64 entry,
@@ -1421,8 +1427,10 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
 
 	/* Process however many RRQ entries that are ready */
 	while (true) {
-		entry = *hrrq_curr;
+		if (atomic_read(&cfg->remove_active))
+			goto out;
 
+		entry = *hrrq_curr;
 		if ((entry & SISL_RESP_HANDLE_T_BIT) != toggle)
 			break;
 
@@ -1440,7 +1448,7 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
 
 	afu->hrrq_curr = hrrq_curr;
 	afu->toggle = toggle;
-
+out:
 	return IRQ_HANDLED;
 }
 
@@ -1454,7 +1462,7 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
 static irqreturn_t cxlflash_async_err_irq(int irq, void *data)
 {
 	struct afu *afu = (struct afu *)data;
-	struct cxlflash_cfg *cfg;
+	struct cxlflash_cfg *cfg = afu->parent;
 	u64 reg_unmasked;
 	const struct asyc_intr_info *info;
 	struct sisl_global_map *global = &afu->afu_map->global;
@@ -1462,7 +1470,8 @@ static irqreturn_t cxlflash_async_err_irq(int irq, void *data)
 	u8 port;
 	int i;
 
-	cfg = afu->parent;
+	if (atomic_read(&cfg->remove_active))
+		goto out;
 
 	reg = readq_be(&global->regs.aintr_status);
 	reg_unmasked = (reg & SISL_ASTATUS_UNMASK);
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-16 16:53 [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove Matthew R. Ochs
@ 2015-09-17 12:38 ` Tomas Henzl
  2015-09-17 17:16   ` Matthew R. Ochs
  0 siblings, 1 reply; 10+ messages in thread
From: Tomas Henzl @ 2015-09-17 12:38 UTC (permalink / raw)
  To: Matthew R. Ochs, linux-scsi, James.Bottomley, nab, brking,
	imunsie, dja, andrew.donnellan
  Cc: mikey, linuxppc-dev, Manoj N. Kumar

On 16.9.2015 18:53, Matthew R. Ochs wrote:
> Interrupt processing can run in parallel to a remove operation. This
> can lead to a condition where the interrupt handler is processing with
> memory that has been freed.
>     
> To avoid processing an interrupt while memory may be yanked, check for
> removal while in the interrupt handler. Bail when removal is imminent.
>
> Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
> Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
> ---
>  drivers/scsi/cxlflash/common.h |  2 ++
>  drivers/scsi/cxlflash/main.c   | 21 +++++++++++++++------
>  2 files changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
> index 1abe4e0..03d2cc6 100644
> --- a/drivers/scsi/cxlflash/common.h
> +++ b/drivers/scsi/cxlflash/common.h
> @@ -103,6 +103,8 @@ struct cxlflash_cfg {
>  	enum cxlflash_lr_state lr_state;
>  	int lr_port;
>  
> +	atomic_t remove_active;
> +
>  	struct cxl_afu *cxl_afu;
>  
>  	struct pci_pool *cxlflash_cmd_pool;
> diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
> index 6e85c77..89ee648 100644
> --- a/drivers/scsi/cxlflash/main.c
> +++ b/drivers/scsi/cxlflash/main.c
> @@ -892,6 +892,7 @@ static void cxlflash_remove(struct pci_dev *pdev)
>  	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
>  
>  	cfg->state = STATE_FAILTERM;
> +	atomic_inc(&cfg->remove_active);

Hi Matthew,
you could just call term_afu at this point, this way you don't
need an additional check in all irq functions.
Cheers,
Tomas

>  	cxlflash_stop_term_user_contexts(cfg);
>  
>  	switch (cfg->init_state) {
> @@ -1380,16 +1381,20 @@ static void afu_err_intr_init(struct afu *afu)
>  static irqreturn_t cxlflash_sync_err_irq(int irq, void *data)
>  {
>  	struct afu *afu = (struct afu *)data;
> +	struct cxlflash_cfg *cfg = afu->parent;
>  	u64 reg;
>  	u64 reg_unmasked;
>  
> +	if (atomic_read(&cfg->remove_active))
> +		goto out;
> +
>  	reg = readq_be(&afu->host_map->intr_status);
>  	reg_unmasked = (reg & SISL_ISTATUS_UNMASK);
>  
>  	if (reg_unmasked == 0UL) {
>  		pr_err("%s: %llX: spurious interrupt, intr_status %016llX\n",
>  		       __func__, (u64)afu, reg);
> -		goto cxlflash_sync_err_irq_exit;
> +		goto out;
>  	}
>  
>  	pr_err("%s: %llX: unexpected interrupt, intr_status %016llX\n",
> @@ -1397,7 +1402,7 @@ static irqreturn_t cxlflash_sync_err_irq(int irq, void *data)
>  
>  	writeq_be(reg_unmasked, &afu->host_map->intr_clear);
>  
> -cxlflash_sync_err_irq_exit:
> +out:
>  	pr_debug("%s: returning rc=%d\n", __func__, IRQ_HANDLED);
>  	return IRQ_HANDLED;
>  }
> @@ -1412,6 +1417,7 @@ cxlflash_sync_err_irq_exit:
>  static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
>  {
>  	struct afu *afu = (struct afu *)data;
> +	struct cxlflash_cfg *cfg = afu->parent;
>  	struct afu_cmd *cmd;
>  	bool toggle = afu->toggle;
>  	u64 entry,
> @@ -1421,8 +1427,10 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
>  
>  	/* Process however many RRQ entries that are ready */
>  	while (true) {
> -		entry = *hrrq_curr;
> +		if (atomic_read(&cfg->remove_active))
> +			goto out;
>  
> +		entry = *hrrq_curr;
>  		if ((entry & SISL_RESP_HANDLE_T_BIT) != toggle)
>  			break;
>  
> @@ -1440,7 +1448,7 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
>  
>  	afu->hrrq_curr = hrrq_curr;
>  	afu->toggle = toggle;
> -
> +out:
>  	return IRQ_HANDLED;
>  }
>  
> @@ -1454,7 +1462,7 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
>  static irqreturn_t cxlflash_async_err_irq(int irq, void *data)
>  {
>  	struct afu *afu = (struct afu *)data;
> -	struct cxlflash_cfg *cfg;
> +	struct cxlflash_cfg *cfg = afu->parent;
>  	u64 reg_unmasked;
>  	const struct asyc_intr_info *info;
>  	struct sisl_global_map *global = &afu->afu_map->global;
> @@ -1462,7 +1470,8 @@ static irqreturn_t cxlflash_async_err_irq(int irq, void *data)
>  	u8 port;
>  	int i;
>  
> -	cfg = afu->parent;
> +	if (atomic_read(&cfg->remove_active))
> +		goto out;
>  
>  	reg = readq_be(&global->regs.aintr_status);
>  	reg_unmasked = (reg & SISL_ASTATUS_UNMASK);

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-17 12:38 ` Tomas Henzl
@ 2015-09-17 17:16   ` Matthew R. Ochs
  2015-09-18 11:59     ` Tomas Henzl
  0 siblings, 1 reply; 10+ messages in thread
From: Matthew R. Ochs @ 2015-09-17 17:16 UTC (permalink / raw)
  To: Tomas Henzl
  Cc: linux-scsi, James.Bottomley, nab, brking, imunsie, dja,
	andrew.donnellan, mikey, linuxppc-dev, Manoj N. Kumar

> On Sep 17, 2015, at 7:38 AM, Tomas Henzl <thenzl@redhat.com> wrote:
>=20
> On 16.9.2015 18:53, Matthew R. Ochs wrote:
>> Interrupt processing can run in parallel to a remove operation. This
>> can lead to a condition where the interrupt handler is processing with
>> memory that has been freed.
>>
>> To avoid processing an interrupt while memory may be yanked, check for
>> removal while in the interrupt handler. Bail when removal is imminent.
>>
>> Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
>> Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
>> ---
>> drivers/scsi/cxlflash/common.h |  2 ++
>> drivers/scsi/cxlflash/main.c   | 21 +++++++++++++++------
>> 2 files changed, 17 insertions(+), 6 deletions(-)
>>=20
>> diff --git a/drivers/scsi/cxlflash/common.h =
b/drivers/scsi/cxlflash/common.h
>> index 1abe4e0..03d2cc6 100644
>> --- a/drivers/scsi/cxlflash/common.h
>> +++ b/drivers/scsi/cxlflash/common.h
>> @@ -103,6 +103,8 @@ struct cxlflash_cfg {
>> 	enum cxlflash_lr_state lr_state;
>> 	int lr_port;
>>=20
>> +	atomic_t remove_active;
>> +
>> 	struct cxl_afu *cxl_afu;
>>=20
>> 	struct pci_pool *cxlflash_cmd_pool;
>> diff --git a/drivers/scsi/cxlflash/main.c =
b/drivers/scsi/cxlflash/main.c
>> index 6e85c77..89ee648 100644
>> --- a/drivers/scsi/cxlflash/main.c
>> +++ b/drivers/scsi/cxlflash/main.c
>> @@ -892,6 +892,7 @@ static void cxlflash_remove(struct pci_dev *pdev)
>> 	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
>>=20
>> 	cfg->state = STATE_FAILTERM;
>> +	atomic_inc(&cfg->remove_active);
>=20
> Hi Matthew,
> you could just call term_afu at this point, this way you don't
> need an additional check in all irq functions.
> Cheers,
> Tomas

Hi Tomas,

We actually do call term_afu() a few lines down from here. I don't follow
how moving it here would help things.

The reason for the atomic was to provide something lightweight that we
could check _inside_ the processing loop for the read-response queue
handler. A check outside that loop doesn't really provide much in terms
of closing or narrowing down the window of when freed memory can be
accessed.

As David Laight correctly pointed out, this approach does not completely
close the window. We'd need something heavier to fully protect (e.g. a lock
to wrap around the entire loop). I will look into adding this in a future cycle
when I can adequately test.


-matt

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-17 17:16   ` Matthew R. Ochs
@ 2015-09-18 11:59     ` Tomas Henzl
  2015-09-18 23:26       ` Matthew R. Ochs
  0 siblings, 1 reply; 10+ messages in thread
From: Tomas Henzl @ 2015-09-18 11:59 UTC (permalink / raw)
  To: Matthew R. Ochs
  Cc: linux-scsi, James.Bottomley, nab, brking, imunsie, dja,
	andrew.donnellan, mikey, linuxppc-dev, Manoj N. Kumar

On 17.9.2015 19:16, Matthew R. Ochs wrote:
>> On Sep 17, 2015, at 7:38 AM, Tomas Henzl <thenzl@redhat.com> wrote:
>>
>> On 16.9.2015 18:53, Matthew R. Ochs wrote:
>>> Interrupt processing can run in parallel to a remove operation. This
>>> can lead to a condition where the interrupt handler is processing with
>>> memory that has been freed.
>>>
>>> To avoid processing an interrupt while memory may be yanked, check for
>>> removal while in the interrupt handler. Bail when removal is imminent.
>>>
>>> Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
>>> Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
>>> ---
>>> drivers/scsi/cxlflash/common.h |  2 ++
>>> drivers/scsi/cxlflash/main.c   | 21 +++++++++++++++------
>>> 2 files changed, 17 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
>>> index 1abe4e0..03d2cc6 100644
>>> --- a/drivers/scsi/cxlflash/common.h
>>> +++ b/drivers/scsi/cxlflash/common.h
>>> @@ -103,6 +103,8 @@ struct cxlflash_cfg {
>>> 	enum cxlflash_lr_state lr_state;
>>> 	int lr_port;
>>>
>>> +	atomic_t remove_active;
>>> +
>>> 	struct cxl_afu *cxl_afu;
>>>
>>> 	struct pci_pool *cxlflash_cmd_pool;
>>> diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
>>> index 6e85c77..89ee648 100644
>>> --- a/drivers/scsi/cxlflash/main.c
>>> +++ b/drivers/scsi/cxlflash/main.c
>>> @@ -892,6 +892,7 @@ static void cxlflash_remove(struct pci_dev *pdev)
>>> 	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
>>>
>>> 	cfg->state = STATE_FAILTERM;
>>> +	atomic_inc(&cfg->remove_active);
>> Hi Matthew,
>> you could just call term_afu at this point, this way you don't
>> need an additional check in all irq functions.
>> Cheers,
>> Tomas
> Hi Tomas,
>
> We actually do call term_afu() a few lines down from here. I don't follow
> how moving it here would help things.

When you disable ints sooner (that is what term_afu does?), you'll get no
more ints later. Isn't this what you want?

>
> The reason for the atomic was to provide something lightweight that we
> could check _inside_ the processing loop for the read-response queue
> handler. A check outside that loop doesn't really provide much in terms
> of closing or narrowing down the window of when freed memory can be
> accessed.
>
> As David Laight correctly pointed out, this approach does not completely
> close the window. We'd need something heavier to fully protect (e.g. a lock
> to wrap around the entire loop). I will look into adding this in a future cycle
> when I can adequately test.

term_afu calls free_irq, and this function
does not return until any executing interrupts for this IRQ have completed.
This is the sync mechanism you need; it's lightweight
(does not add an additional check to your irq functions)
and closes the race window completely.

--tm

>
>
> -matt
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-18 11:59     ` Tomas Henzl
@ 2015-09-18 23:26       ` Matthew R. Ochs
  2015-09-21 11:33         ` Tomas Henzl
  0 siblings, 1 reply; 10+ messages in thread
From: Matthew R. Ochs @ 2015-09-18 23:26 UTC (permalink / raw)
  To: Tomas Henzl
  Cc: linux-scsi, James.Bottomley, nab, brking, imunsie, dja,
	andrew.donnellan, mikey, linuxppc-dev, Manoj N. Kumar

> On Sep 18, 2015, at 6:59 AM, Tomas Henzl <thenzl@redhat.com> wrote:
> On 17.9.2015 19:16, Matthew R. Ochs wrote:
>>> On Sep 17, 2015, at 7:38 AM, Tomas Henzl <thenzl@redhat.com> wrote:
>>>=20
>>> On 16.9.2015 18:53, Matthew R. Ochs wrote:
>>>> Interrupt processing can run in parallel to a remove operation. =
This
>>>> can lead to a condition where the interrupt handler is processing =
with
>>>> memory that has been freed.
>>>>=20
>>>> To avoid processing an interrupt while memory may be yanked, check =
for
>>>> removal while in the interrupt handler. Bail when removal is =
imminent.
>>>>=20
>>>> Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
>>>> Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
>>>> ---
>>>> drivers/scsi/cxlflash/common.h |  2 ++
>>>> drivers/scsi/cxlflash/main.c   | 21 +++++++++++++++------
>>>> 2 files changed, 17 insertions(+), 6 deletions(-)
>>>>=20
>>>> diff --git a/drivers/scsi/cxlflash/common.h =
b/drivers/scsi/cxlflash/common.h
>>>> index 1abe4e0..03d2cc6 100644
>>>> --- a/drivers/scsi/cxlflash/common.h
>>>> +++ b/drivers/scsi/cxlflash/common.h
>>>> @@ -103,6 +103,8 @@ struct cxlflash_cfg {
>>>> 	enum cxlflash_lr_state lr_state;
>>>> 	int lr_port;
>>>>=20
>>>> +	atomic_t remove_active;
>>>> +
>>>> 	struct cxl_afu *cxl_afu;
>>>>=20
>>>> 	struct pci_pool *cxlflash_cmd_pool;
>>>> diff --git a/drivers/scsi/cxlflash/main.c =
b/drivers/scsi/cxlflash/main.c
>>>> index 6e85c77..89ee648 100644
>>>> --- a/drivers/scsi/cxlflash/main.c
>>>> +++ b/drivers/scsi/cxlflash/main.c
>>>> @@ -892,6 +892,7 @@ static void cxlflash_remove(struct pci_dev =
*pdev)
>>>> 	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
>>>>=20
>>>> 	cfg->state =3D STATE_FAILTERM;
>>>> +	atomic_inc(&cfg->remove_active);
>>> Hi Matthew,
>>> you could just call term_afu at this point, this way you don't
>>> need an additional check in all irq functions.
>>> Cheers,
>>> Tomas
>> Hi Tomas,
>>=20
>> We actually do call term_afu() a few lines down from here. I don't follow
>> how moving it here would help things.
>
> When you disable ints sooner (that is what term_afu does ?) you'll get no
> more ints later isn't this what you want?

Correct, that's what we want.

>> The reason for the atomic was to provide something lightweight that we
>> could check _inside_ the processing loop for the read-response queue
>> handler. A check outside that loop doesn't really provide much in terms
>> of closing or narrowing down the window of when freed memory can be
>> accessed.
>>
>> As David Laight correctly pointed out, this approach does not completely
>> close the window. We'd need something heavier to fully protect (e.g. a lock
>> to wrap around the entire loop). I will look into adding this in a future cycle
>> when I can adequately test.
>=20
> term_afu calls free_irq and this function
> does not return until any executing interrupts for have completed.
> This is the sync mechanism you need, it's lightweight
> (does not add an additional check to your irq functions)
> and closes the race window completely.

Thanks for clarifying!

I looked at this closer and you are correct, free_irq() guarantees not
to return until the interrupt handler has completed. The current location
of term_afu() is appropriate as the memory that the handler touches is
not freed until the very end [by free_mem() and scsi_host_put()] of the
remove. Thus we can simply ignore this patch (I'll remove it in a v3).


-matt

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-18 23:26       ` Matthew R. Ochs
@ 2015-09-21 11:33         ` Tomas Henzl
  2015-09-21 21:58           ` Matthew R. Ochs
  0 siblings, 1 reply; 10+ messages in thread
From: Tomas Henzl @ 2015-09-21 11:33 UTC (permalink / raw)
  To: Matthew R. Ochs
  Cc: linux-scsi, James.Bottomley, nab, brking, imunsie, dja,
	andrew.donnellan, mikey, linuxppc-dev, Manoj N. Kumar

On 19.9.2015 01:26, Matthew R. Ochs wrote:
>> On Sep 18, 2015, at 6:59 AM, Tomas Henzl <thenzl@redhat.com> wrote:
>> On 17.9.2015 19:16, Matthew R. Ochs wrote:
>>>> On Sep 17, 2015, at 7:38 AM, Tomas Henzl <thenzl@redhat.com> wrote:
>>>>
>>>> On 16.9.2015 18:53, Matthew R. Ochs wrote:
>>>>> Interrupt processing can run in parallel to a remove operation. This
>>>>> can lead to a condition where the interrupt handler is processing with
>>>>> memory that has been freed.
>>>>>
>>>>> To avoid processing an interrupt while memory may be yanked, check for
>>>>> removal while in the interrupt handler. Bail when removal is imminent.
>>>>>
>>>>> Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
>>>>> Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
>>>>> ---
>>>>> drivers/scsi/cxlflash/common.h |  2 ++
>>>>> drivers/scsi/cxlflash/main.c   | 21 +++++++++++++++------
>>>>> 2 files changed, 17 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
>>>>> index 1abe4e0..03d2cc6 100644
>>>>> --- a/drivers/scsi/cxlflash/common.h
>>>>> +++ b/drivers/scsi/cxlflash/common.h
>>>>> @@ -103,6 +103,8 @@ struct cxlflash_cfg {
>>>>> 	enum cxlflash_lr_state lr_state;
>>>>> 	int lr_port;
>>>>>
>>>>> +	atomic_t remove_active;
>>>>> +
>>>>> 	struct cxl_afu *cxl_afu;
>>>>>
>>>>> 	struct pci_pool *cxlflash_cmd_pool;
>>>>> diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
>>>>> index 6e85c77..89ee648 100644
>>>>> --- a/drivers/scsi/cxlflash/main.c
>>>>> +++ b/drivers/scsi/cxlflash/main.c
>>>>> @@ -892,6 +892,7 @@ static void cxlflash_remove(struct pci_dev *pdev)
>>>>> 	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
>>>>>
>>>>> 	cfg->state = STATE_FAILTERM;
>>>>> +	atomic_inc(&cfg->remove_active);
>>>> Hi Matthew,
>>>> you could just call term_afu at this point, this way you don't
>>>> need an additional check in all irq functions.
>>>> Cheers,
>>>> Tomas
>>> Hi Tomas,
>>>
>>> We actually do call term_afu() a few lines down from here. I don't follow
>>> how moving it here would help things.
>> When you disable ints sooner (that is what term_afu does ?) you'll get no
>> more ints later isn't this what you want?
> Correct, that's what we want.
>
>>> The reason for the atomic was to provide something lightweight that we
>>> could check _inside_ the processing loop for the read-response queue
>>> handler. A check outside that loop doesn't really provide much in terms
>>> of closing or narrowing down the window of when freed memory can be
>>> accessed.
>>>
>>> As David Laight correctly pointed out, this approach does not completely
>>> close the window. We'd need something heavier to fully protect (e.g. a lock
>>> to wrap around the entire loop). I will look into adding this in a future cycle
>>> when I can adequately test.
>> term_afu calls free_irq and this function
>> does not return until any executing interrupts for have completed.
>> This is the sync mechanism you need, it's lightweight
>> (does not add an additional check to your irq functions)
>> and closes the race window completely.
> Thanks for clarifying!
>
> I looked at this closer and you are correct, free_irq() guarantees not
> to return until the interrupt handler has completed. The current location
> of term_afu() is appropriate as the memory that the handler touches is
> not freed until the very end [by free_mem() and scsi_host_put()] of the
> remove. Thus we can simply ignore this patch (I'll remove it in a v3).

OK. In some future patch, please reorganize the remove function so
that it follows the template described in Documentation/PCI/pci.txt:
	Disable the device from generating IRQs
	Release the IRQ (free_irq())
	Stop all DMA activity
	Release DMA buffers (both streaming and coherent)
	Unregister from other subsystems (e.g. scsi or netdev)
	Release MMIO/IOP resources
	Disable the device

>
>
> -matt
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-21 11:33         ` Tomas Henzl
@ 2015-09-21 21:58           ` Matthew R. Ochs
  0 siblings, 0 replies; 10+ messages in thread
From: Matthew R. Ochs @ 2015-09-21 21:58 UTC (permalink / raw)
  To: Tomas Henzl
  Cc: linux-scsi, James.Bottomley, nab, brking, imunsie, dja,
	andrew.donnellan, mikey, linuxppc-dev, Manoj N. Kumar

> On Sep 21, 2015, at 6:33 AM, Tomas Henzl <thenzl@redhat.com> wrote:
> On 19.9.2015 01:26, Matthew R. Ochs wrote:
>>> On Sep 18, 2015, at 6:59 AM, Tomas Henzl <thenzl@redhat.com> wrote:
>>> On 17.9.2015 19:16, Matthew R. Ochs wrote:
>>>>> On Sep 17, 2015, at 7:38 AM, Tomas Henzl <thenzl@redhat.com> =
wrote:
>>>>>=20
>>>>> On 16.9.2015 18:53, Matthew R. Ochs wrote:
>>>>>> Interrupt processing can run in parallel to a remove operation. =
This
>>>>>> can lead to a condition where the interrupt handler is processing =
with
>>>>>> memory that has been freed.
>>>>>>=20
>>>>>> To avoid processing an interrupt while memory may be yanked, =
check for
>>>>>> removal while in the interrupt handler. Bail when removal is =
imminent.
>>>>>>=20
>>>>>> Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
>>>>>> Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
>>>>>> ---
>>>>>> drivers/scsi/cxlflash/common.h |  2 ++
>>>>>> drivers/scsi/cxlflash/main.c   | 21 +++++++++++++++------
>>>>>> 2 files changed, 17 insertions(+), 6 deletions(-)
>>>>>>=20
>>>>>> diff --git a/drivers/scsi/cxlflash/common.h =
b/drivers/scsi/cxlflash/common.h
>>>>>> index 1abe4e0..03d2cc6 100644
>>>>>> --- a/drivers/scsi/cxlflash/common.h
>>>>>> +++ b/drivers/scsi/cxlflash/common.h
>>>>>> @@ -103,6 +103,8 @@ struct cxlflash_cfg {
>>>>>> 	enum cxlflash_lr_state lr_state;
>>>>>> 	int lr_port;
>>>>>>=20
>>>>>> +	atomic_t remove_active;
>>>>>> +
>>>>>> 	struct cxl_afu *cxl_afu;
>>>>>>=20
>>>>>> 	struct pci_pool *cxlflash_cmd_pool;
>>>>>> diff --git a/drivers/scsi/cxlflash/main.c =
b/drivers/scsi/cxlflash/main.c
>>>>>> index 6e85c77..89ee648 100644
>>>>>> --- a/drivers/scsi/cxlflash/main.c
>>>>>> +++ b/drivers/scsi/cxlflash/main.c
>>>>>> @@ -892,6 +892,7 @@ static void cxlflash_remove(struct pci_dev =
*pdev)
>>>>>> 	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
>>>>>>=20
>>>>>> 	cfg->state =3D STATE_FAILTERM;
>>>>>> +	atomic_inc(&cfg->remove_active);
>>>>> Hi Matthew,
>>>>> you could just call term_afu at this point, this way you don't
>>>>> need an additional check in all irq functions.
>>>>> Cheers,
>>>>> Tomas
>>>> Hi Tomas,
>>>>=20
>>>> We actually do call term_afu() a few lines down from here. I don't =
follow
>>>> how moving it here would help things.
>>> When you disable ints sooner (that is what term_afu does ?) you'll =
get no
>>> more ints later isn't this what you want?
>> Correct, that's what we want.
>>=20
>>>> The reason for the atomic was to provide something lightweight that =
we
>>>> could check _inside_ the processing loop for the read-response =
queue
>>>> handler. A check outside that loop doesn't really provide much in =
terms
>>>> of closing or narrowing down the window of when freed memory can be
>>>> accessed.
>>>>=20
>>>> As David Laight correctly pointed out, this approach does not =
completely
>>>> close the window. We'd need something heavier to fully protect =
(e.g. a lock
>>>> to wrap around the entire loop). I will look into adding this in a =
future cycle
>>>> when I can adequately test.
>>> term_afu calls free_irq and this function
>>> does not return until any executing interrupts for have completed.
>>> This is the sync mechanism you need, it's lightweight
>>> (does not add an additional check to your irq functions)
>>> and closes the race window completely.
>> Thanks for clarifying!
>>=20
>> I looked at this closer and you are correct, free_irq() guarantees not
>> to return until the interrupt handler has completed. The current location
>> of term_afu() is appropriate as the memory that the handler touches is
>> not freed until the very end [by free_mem() and scsi_host_put()] of the
>> remove. Thus we can simply ignore this patch (I'll remove it in a v3).
>
> OK. In some future patch please reorganize the remove function so,
> that it follows the template described in Documentation/PCI/pci.txt :
> 	Disable the device from generating IRQs
> 	Release the IRQ (free_irq())
> 	Stop all DMA activity
> 	Release DMA buffers (both streaming and coherent)
> 	Unregister from other subsystems (e.g. scsi or netdev)
> 	Release MMIO/IOP resources
> 	Disable the device

Will do.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-17 11:58   ` David Laight
@ 2015-09-17 16:55     ` Matthew R. Ochs
  0 siblings, 0 replies; 10+ messages in thread
From: Matthew R. Ochs @ 2015-09-17 16:55 UTC (permalink / raw)
  To: David Laight
  Cc: linux-scsi, James Bottomley, Nicholas A. Bellinger, Brian King,
	Ian Munsie, Daniel Axtens, Andrew Donnellan, Michael Neuling,
	Manoj N. Kumar, linuxppc-dev

> On Sep 17, 2015, at 6:58 AM, David Laight <David.Laight@ACULAB.COM> wrote:
>
> From: Linuxppc-dev Matthew R. Ochs
>> Sent: 16 September 2015 22:28
>> Interrupt processing can run in parallel to a remove operation. This
>> can lead to a condition where the interrupt handler is processing with
>> memory that has been freed.
>>
>> To avoid processing an interrupt while memory may be yanked, check for
>> removal while in the interrupt handler. Bail when removal is imminent.
>
> On the face of it this just reduces the size of the window somewhat.

Agreed.

>
> What happens if the interrupt routine reads the flag just before it is set
> (so is processing the entry that is being removed) and is then (say)
> interrupted by a higher priority interrupt that takes longer to execute than
> the remove code?

Understood. To completely close the window, we'd need to either introduce a
lock or a reciprocal flag/count such that the remove doesn't make forward
progress until after interrupt processing has completed. I can look at
introducing such a mechanism in a later patch to fully remove the exposure.


-matt

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-16 21:28 ` [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove Matthew R. Ochs
@ 2015-09-17 11:58   ` David Laight
  2015-09-17 16:55     ` Matthew R. Ochs
  0 siblings, 1 reply; 10+ messages in thread
From: David Laight @ 2015-09-17 11:58 UTC (permalink / raw)
  To: 'Matthew R. Ochs',
	linux-scsi, James Bottomley, Nicholas A. Bellinger, Brian King,
	Ian Munsie, Daniel Axtens, Andrew Donnellan
  Cc: Michael Neuling, Manoj N. Kumar, linuxppc-dev

From: Linuxppc-dev Matthew R. Ochs
> Sent: 16 September 2015 22:28
> Interrupt processing can run in parallel to a remove operation. This
> can lead to a condition where the interrupt handler is processing with
> memory that has been freed.
>
> To avoid processing an interrupt while memory may be yanked, check for
> removal while in the interrupt handler. Bail when removal is imminent.

On the face of it this just reduces the size of the window somewhat.

What happens if the interrupt routine reads the flag just before it is set
(so is processing the entry that is being removed) and is then (say)
interrupted by a higher priority interrupt that takes longer to execute than
the remove code?

You've still got an interrupt routine accessing freed memory.

	David

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove
  2015-09-16 21:23 [PATCH v2 00/30] cxlflash: Miscellaneous bug fixes and corrections Matthew R. Ochs
@ 2015-09-16 21:28 ` Matthew R. Ochs
  2015-09-17 11:58   ` David Laight
  0 siblings, 1 reply; 10+ messages in thread
From: Matthew R. Ochs @ 2015-09-16 21:28 UTC (permalink / raw)
  To: linux-scsi, James Bottomley, Nicholas A. Bellinger, Brian King,
	Ian Munsie, Daniel Axtens, Andrew Donnellan
  Cc: Michael Neuling, linuxppc-dev, Manoj N. Kumar

Interrupt processing can run in parallel to a remove operation. This
can lead to a condition where the interrupt handler is processing with
memory that has been freed.
    
To avoid processing an interrupt while memory may be yanked, check for
removal while in the interrupt handler. Bail when removal is imminent.

Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Signed-off-by: Manoj N. Kumar <manoj@linux.vnet.ibm.com>
---
 drivers/scsi/cxlflash/common.h |  2 ++
 drivers/scsi/cxlflash/main.c   | 21 +++++++++++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
index 1abe4e0..03d2cc6 100644
--- a/drivers/scsi/cxlflash/common.h
+++ b/drivers/scsi/cxlflash/common.h
@@ -103,6 +103,8 @@ struct cxlflash_cfg {
 	enum cxlflash_lr_state lr_state;
 	int lr_port;
 
+	atomic_t remove_active;
+
 	struct cxl_afu *cxl_afu;
 
 	struct pci_pool *cxlflash_cmd_pool;
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index 6e85c77..89ee648 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -892,6 +892,7 @@ static void cxlflash_remove(struct pci_dev *pdev)
 	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
 
 	cfg->state = STATE_FAILTERM;
+	atomic_inc(&cfg->remove_active);
 	cxlflash_stop_term_user_contexts(cfg);
 
 	switch (cfg->init_state) {
@@ -1380,16 +1381,20 @@ static void afu_err_intr_init(struct afu *afu)
 static irqreturn_t cxlflash_sync_err_irq(int irq, void *data)
 {
 	struct afu *afu = (struct afu *)data;
+	struct cxlflash_cfg *cfg = afu->parent;
 	u64 reg;
 	u64 reg_unmasked;
 
+	if (atomic_read(&cfg->remove_active))
+		goto out;
+
 	reg = readq_be(&afu->host_map->intr_status);
 	reg_unmasked = (reg & SISL_ISTATUS_UNMASK);
 
 	if (reg_unmasked == 0UL) {
 		pr_err("%s: %llX: spurious interrupt, intr_status %016llX\n",
 		       __func__, (u64)afu, reg);
-		goto cxlflash_sync_err_irq_exit;
+		goto out;
 	}
 
 	pr_err("%s: %llX: unexpected interrupt, intr_status %016llX\n",
@@ -1397,7 +1402,7 @@ static irqreturn_t cxlflash_sync_err_irq(int irq, void *data)
 
 	writeq_be(reg_unmasked, &afu->host_map->intr_clear);
 
-cxlflash_sync_err_irq_exit:
+out:
 	pr_debug("%s: returning rc=%d\n", __func__, IRQ_HANDLED);
 	return IRQ_HANDLED;
 }
@@ -1412,6 +1417,7 @@ cxlflash_sync_err_irq_exit:
 static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
 {
 	struct afu *afu = (struct afu *)data;
+	struct cxlflash_cfg *cfg = afu->parent;
 	struct afu_cmd *cmd;
 	bool toggle = afu->toggle;
 	u64 entry,
@@ -1421,8 +1427,10 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
 
 	/* Process however many RRQ entries that are ready */
 	while (true) {
-		entry = *hrrq_curr;
+		if (atomic_read(&cfg->remove_active))
+			goto out;
 
+		entry = *hrrq_curr;
 		if ((entry & SISL_RESP_HANDLE_T_BIT) != toggle)
 			break;
 
@@ -1440,7 +1448,7 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
 
 	afu->hrrq_curr = hrrq_curr;
 	afu->toggle = toggle;
-
+out:
 	return IRQ_HANDLED;
 }
 
@@ -1454,7 +1462,7 @@ static irqreturn_t cxlflash_rrq_irq(int irq, void *data)
 static irqreturn_t cxlflash_async_err_irq(int irq, void *data)
 {
 	struct afu *afu = (struct afu *)data;
-	struct cxlflash_cfg *cfg;
+	struct cxlflash_cfg *cfg = afu->parent;
 	u64 reg_unmasked;
 	const struct asyc_intr_info *info;
 	struct sisl_global_map *global = &afu->afu_map->global;
@@ -1462,7 +1470,8 @@ static irqreturn_t cxlflash_async_err_irq(int irq, void *data)
 	u8 port;
 	int i;
 
-	cfg = afu->parent;
+	if (atomic_read(&cfg->remove_active))
+		goto out;
 
 	reg = readq_be(&global->regs.aintr_status);
 	reg_unmasked = (reg & SISL_ASTATUS_UNMASK);
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-09-21 21:58 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-16 16:53 [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove Matthew R. Ochs
2015-09-17 12:38 ` Tomas Henzl
2015-09-17 17:16   ` Matthew R. Ochs
2015-09-18 11:59     ` Tomas Henzl
2015-09-18 23:26       ` Matthew R. Ochs
2015-09-21 11:33         ` Tomas Henzl
2015-09-21 21:58           ` Matthew R. Ochs
2015-09-16 21:23 [PATCH v2 00/30] cxlflash: Miscellaneous bug fixes and corrections Matthew R. Ochs
2015-09-16 21:28 ` [PATCH v2 09/30] cxlflash: Fix to stop interrupt processing on remove Matthew R. Ochs
2015-09-17 11:58   ` David Laight
2015-09-17 16:55     ` Matthew R. Ochs

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).