All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
@ 2017-12-04 17:57 Logan Gunthorpe
  2017-12-05  1:47 ` ThanhTuThai
  2017-12-05 19:30 ` Jon Mason
  0 siblings, 2 replies; 9+ messages in thread
From: Logan Gunthorpe @ 2017-12-04 17:57 UTC (permalink / raw)
  To: linux-ntb; +Cc: ThanhTuThai, Logan Gunthorpe

If one host crashes and soft reboots, the other host may not see a
link down event. Then when the crashed host comes back up, the
surviving host may not know the link was reset and the NTB clients
may not work without being reset.

To solve this, we send a LINK_FORCE_DOWN message to each peer every
time we come up, before we register the NTB device. If a surviving
host still thinks the link is up it will take it down immediately.
In this way, once the crashed host comes up fully, it will send a
regular link up event as per usual and the link will be properly
restarted.

While we are in the area, this also fixes the MSG_LINK_UP message that
was in the link down function that was reported by Doug Meyers.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reported-by: ThanhTuThai <cruisethai@gmail.com>
---

I had a small epiphany over the weekend as to why the link still was
not coming up after a hard reboot. I think this will fix it.

@Thai Thu: Can you please retest?

Changes in v2:
 * Reinitialize the shared memory window on a forced link down

 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 57 +++++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 709f37fbe232..6bc213cf7bf6 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -118,6 +118,7 @@ struct switchtec_ntb {
 	bool link_is_up;
 	enum ntb_speed link_speed;
 	enum ntb_width link_width;
+	struct work_struct link_reinit_work;
 };

 static struct switchtec_ntb *ntb_sndev(struct ntb_dev *ntb)
@@ -463,18 +464,43 @@ static void switchtec_ntb_set_link_speed(struct switchtec_ntb *sndev)
 	sndev->link_width = min(self_width, peer_width);
 }

-enum {
+enum switchtec_msg {
 	LINK_MESSAGE = 0,
 	MSG_LINK_UP = 1,
 	MSG_LINK_DOWN = 2,
 	MSG_CHECK_LINK = 3,
+	MSG_LINK_FORCE_DOWN = 4,
 };

-static void switchtec_ntb_check_link(struct switchtec_ntb *sndev)
+static int switchtec_ntb_reinit_peer(struct switchtec_ntb *sndev);
+
+static void link_reinit_work(struct work_struct *work)
+{
+	struct switchtec_ntb *sndev;
+
+	sndev = container_of(work, struct switchtec_ntb, link_reinit_work);
+
+	switchtec_ntb_reinit_peer(sndev);
+}
+
+static void switchtec_ntb_check_link(struct switchtec_ntb *sndev,
+				     enum switchtec_msg msg)
 {
 	int link_sta;
 	int old = sndev->link_is_up;

+	if (msg == MSG_LINK_FORCE_DOWN) {
+		schedule_work(&sndev->link_reinit_work);
+
+		if (sndev->link_is_up) {
+			sndev->link_is_up = 0;
+			ntb_link_event(&sndev->ntb);
+			dev_info(&sndev->stdev->dev, "ntb link forced down\n");
+		}
+
+		return;
+	}
+
 	link_sta = sndev->self_shared->link_sta;
 	if (link_sta) {
 		u64 peer = ioread64(&sndev->peer_shared->magic);
@@ -500,7 +526,7 @@ static void switchtec_ntb_link_notification(struct switchtec_dev *stdev)
 {
 	struct switchtec_ntb *sndev = stdev->sndev;

-	switchtec_ntb_check_link(sndev);
+	switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);
 }

 static u64 switchtec_ntb_link_is_up(struct ntb_dev *ntb,
@@ -528,7 +554,7 @@ static int switchtec_ntb_link_enable(struct ntb_dev *ntb,
 	sndev->self_shared->link_sta = 1;
 	switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_UP);

-	switchtec_ntb_check_link(sndev);
+	switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);

 	return 0;
 }
@@ -540,9 +566,9 @@ static int switchtec_ntb_link_disable(struct ntb_dev *ntb)
 	dev_dbg(&sndev->stdev->dev, "disabling link\n");

 	sndev->self_shared->link_sta = 0;
-	switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_UP);
+	switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_DOWN);

-	switchtec_ntb_check_link(sndev);
+	switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);

 	return 0;
 }
@@ -785,6 +811,8 @@ static void switchtec_ntb_init_sndev(struct switchtec_ntb *sndev)
 	sndev->ntb.topo = NTB_TOPO_SWITCH;
 	sndev->ntb.ops = &switchtec_ntb_ops;

+	INIT_WORK(&sndev->link_reinit_work, link_reinit_work);
+
 	sndev->self_partition = sndev->stdev->partition;

 	sndev->mmio_ntb = sndev->stdev->mmio_ntb;
@@ -1062,7 +1090,7 @@ static irqreturn_t switchtec_ntb_message_isr(int irq, void *dev)
 			iowrite8(1, &sndev->mmio_self_dbmsg->imsg[i].status);

 			if (i == LINK_MESSAGE)
-				switchtec_ntb_check_link(sndev);
+				switchtec_ntb_check_link(sndev, msg);
 		}
 	}

@@ -1123,6 +1151,14 @@ static void switchtec_ntb_deinit_db_msg_irq(struct switchtec_ntb *sndev)
 	free_irq(sndev->message_irq, sndev);
 }

+static int switchtec_ntb_reinit_peer(struct switchtec_ntb *sndev)
+{
+	dev_info(&sndev->stdev->dev, "peer reinitialized\n");
+	switchtec_ntb_deinit_shared_mw(sndev);
+	switchtec_ntb_init_mw(sndev);
+	return switchtec_ntb_init_shared_mw(sndev);
+}
+
 static int switchtec_ntb_add(struct device *dev,
 			     struct class_interface *class_intf)
 {
@@ -1160,6 +1196,13 @@ static int switchtec_ntb_add(struct device *dev,
 	if (rc)
 		goto deinit_shared_and_exit;

+	/*
+	 * If this host crashed, the other host may think the link is
+	 * still up. Tell them to force it down (it will go back up
+	 * once we register the ntb device).
+	 */
+	switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_FORCE_DOWN);
+
 	rc = ntb_register_device(&sndev->ntb);
 	if (rc)
 		goto deinit_and_exit;
--
2.11.0

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
  2017-12-04 17:57 [PATCH v2] ntb_hw_switchtec: Force down the link before initializing Logan Gunthorpe
@ 2017-12-05  1:47 ` ThanhTuThai
  2017-12-05  3:00   ` Logan Gunthorpe
  2017-12-05 19:30 ` Jon Mason
  1 sibling, 1 reply; 9+ messages in thread
From: ThanhTuThai @ 2017-12-05  1:47 UTC (permalink / raw)
  To: Logan Gunthorpe; +Cc: linux-ntb

[-- Attachment #1: Type: text/plain, Size: 6573 bytes --]

Dear Logan,

The problem is that the good host even didn't receive any message when the
crashed peer come up, so your  "re-init scheduled work" will not be
activated.
Because the good peer didn't receive any interrupt to activate the
scheduled work.

Thank you !

On 5 December 2017 at 01:57, Logan Gunthorpe <logang@deltatee.com> wrote:

> If one host crashes and soft reboots, the other host may not see a
> link down event. Then when the crashed host comes back up, the
> surviving host may not know the link was reset and the NTB clients
> may not work without being reset.
>
> To solve this, we send a LINK_FORCE_DOWN message to each peer every
> time we come up, before we register the NTB device. If a surviving
> host still thinks the link is up it will take it down immediately.
> In this way, once the crashed host comes up fully, it will send a
> regular link up event as per usual and the link will be properly
> restarted.
>
> While we are in the area, this also fixes the MSG_LINK_UP message that
> was in the link down function that was reported by Doug Meyers.
>
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> Reported-by: ThanhTuThai <cruisethai@gmail.com>
> ---
>
> I had a small epiphany over the weekend as to why the link still was
> not coming up after a hard reboot. I think this will fix it.
>
> @Thai Thu: Can you please retest?
>
> Changes in v2:
>  * Reinitialize the shared memory window on a forced link down
>
>  drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 57
> +++++++++++++++++++++++++++++-----
>  1 file changed, 50 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
> b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
> index 709f37fbe232..6bc213cf7bf6 100644
> --- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
> +++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
> @@ -118,6 +118,7 @@ struct switchtec_ntb {
>         bool link_is_up;
>         enum ntb_speed link_speed;
>         enum ntb_width link_width;
> +       struct work_struct link_reinit_work;
>  };
>
>  static struct switchtec_ntb *ntb_sndev(struct ntb_dev *ntb)
> @@ -463,18 +464,43 @@ static void switchtec_ntb_set_link_speed(struct
> switchtec_ntb *sndev)
>         sndev->link_width = min(self_width, peer_width);
>  }
>
> -enum {
> +enum switchtec_msg {
>         LINK_MESSAGE = 0,
>         MSG_LINK_UP = 1,
>         MSG_LINK_DOWN = 2,
>         MSG_CHECK_LINK = 3,
> +       MSG_LINK_FORCE_DOWN = 4,
>  };
>
> -static void switchtec_ntb_check_link(struct switchtec_ntb *sndev)
> +static int switchtec_ntb_reinit_peer(struct switchtec_ntb *sndev);
> +
> +static void link_reinit_work(struct work_struct *work)
> +{
> +       struct switchtec_ntb *sndev;
> +
> +       sndev = container_of(work, struct switchtec_ntb, link_reinit_work);
> +
> +       switchtec_ntb_reinit_peer(sndev);
> +}
> +
> +static void switchtec_ntb_check_link(struct switchtec_ntb *sndev,
> +                                    enum switchtec_msg msg)
>  {
>         int link_sta;
>         int old = sndev->link_is_up;
>
> +       if (msg == MSG_LINK_FORCE_DOWN) {
> +               schedule_work(&sndev->link_reinit_work);
> +
> +               if (sndev->link_is_up) {
> +                       sndev->link_is_up = 0;
> +                       ntb_link_event(&sndev->ntb);
> +                       dev_info(&sndev->stdev->dev, "ntb link forced
> down\n");
> +               }
> +
> +               return;
> +       }
> +
>         link_sta = sndev->self_shared->link_sta;
>         if (link_sta) {
>                 u64 peer = ioread64(&sndev->peer_shared->magic);
> @@ -500,7 +526,7 @@ static void switchtec_ntb_link_notification(struct
> switchtec_dev *stdev)
>  {
>         struct switchtec_ntb *sndev = stdev->sndev;
>
> -       switchtec_ntb_check_link(sndev);
> +       switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);
>  }
>
>  static u64 switchtec_ntb_link_is_up(struct ntb_dev *ntb,
> @@ -528,7 +554,7 @@ static int switchtec_ntb_link_enable(struct ntb_dev
> *ntb,
>         sndev->self_shared->link_sta = 1;
>         switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_UP);
>
> -       switchtec_ntb_check_link(sndev);
> +       switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);
>
>         return 0;
>  }
> @@ -540,9 +566,9 @@ static int switchtec_ntb_link_disable(struct ntb_dev
> *ntb)
>         dev_dbg(&sndev->stdev->dev, "disabling link\n");
>
>         sndev->self_shared->link_sta = 0;
> -       switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_UP);
> +       switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_DOWN);
>
> -       switchtec_ntb_check_link(sndev);
> +       switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);
>
>         return 0;
>  }
> @@ -785,6 +811,8 @@ static void switchtec_ntb_init_sndev(struct
> switchtec_ntb *sndev)
>         sndev->ntb.topo = NTB_TOPO_SWITCH;
>         sndev->ntb.ops = &switchtec_ntb_ops;
>
> +       INIT_WORK(&sndev->link_reinit_work, link_reinit_work);
> +
>         sndev->self_partition = sndev->stdev->partition;
>
>         sndev->mmio_ntb = sndev->stdev->mmio_ntb;
> @@ -1062,7 +1090,7 @@ static irqreturn_t switchtec_ntb_message_isr(int
> irq, void *dev)
>                         iowrite8(1, &sndev->mmio_self_dbmsg->imsg[
> i].status);
>
>                         if (i == LINK_MESSAGE)
> -                               switchtec_ntb_check_link(sndev);
> +                               switchtec_ntb_check_link(sndev, msg);
>                 }
>         }
>
> @@ -1123,6 +1151,14 @@ static void switchtec_ntb_deinit_db_msg_irq(struct
> switchtec_ntb *sndev)
>         free_irq(sndev->message_irq, sndev);
>  }
>
> +static int switchtec_ntb_reinit_peer(struct switchtec_ntb *sndev)
> +{
> +       dev_info(&sndev->stdev->dev, "peer reinitialized\n");
> +       switchtec_ntb_deinit_shared_mw(sndev);
> +       switchtec_ntb_init_mw(sndev);
> +       return switchtec_ntb_init_shared_mw(sndev);
> +}
> +
>  static int switchtec_ntb_add(struct device *dev,
>                              struct class_interface *class_intf)
>  {
> @@ -1160,6 +1196,13 @@ static int switchtec_ntb_add(struct device *dev,
>         if (rc)
>                 goto deinit_shared_and_exit;
>
> +       /*
> +        * If this host crashed, the other host may think the link is
> +        * still up. Tell them to force it down (it will go back up
> +        * once we register the ntb device).
> +        */
> +       switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_FORCE_DOWN);
> +
>         rc = ntb_register_device(&sndev->ntb);
>         if (rc)
>                 goto deinit_and_exit;
> --
> 2.11.0
>

[-- Attachment #2: Type: text/html, Size: 8298 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
  2017-12-05  1:47 ` ThanhTuThai
@ 2017-12-05  3:00   ` Logan Gunthorpe
  2017-12-05  3:34     ` ThanhTuThai
  0 siblings, 1 reply; 9+ messages in thread
From: Logan Gunthorpe @ 2017-12-05  3:00 UTC (permalink / raw)
  To: ThanhTuThai; +Cc: linux-ntb



On 04/12/17 06:47 PM, ThanhTuThai wrote:
> The problem is that the good host even didn't receive any message when
> the crashed peer come up, so your  "re-init scheduled work" will not be
> activated.
> Because the good peer didn't receive any interrupt to activate the
> scheduled work.

What's your evidence here? You haven't really sent me anything to go on.
Did you actually try the patch or did you just assume it doesn't work?

In any case, if your assertion is correct, then it's a firmware issue
and you'll have to contact Microsemi and get them to look at it.

Logan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
  2017-12-05  3:00   ` Logan Gunthorpe
@ 2017-12-05  3:34     ` ThanhTuThai
  2017-12-05  3:44       ` Logan Gunthorpe
  0 siblings, 1 reply; 9+ messages in thread
From: ThanhTuThai @ 2017-12-05  3:34 UTC (permalink / raw)
  To: Logan Gunthorpe; +Cc: linux-ntb

[-- Attachment #1: Type: text/plain, Size: 1177 bytes --]

Dear Logan,

The evidence is the "printk" in interrupt routine.
For software-reboot, the "printk" printed out perfectly.
For power-reset, it didn't, is it enough ?

I actually try your patch, and I even wrote the solution you suggested, I
also implemented it myself before, but no message received from the crashed
peer, I said that to you many times.

I didn't send you the log, b/c nothing there, it doesn't receive any
message, so it didn't log anything.

Anyway, thanks for you support !!!

On 5 December 2017 at 11:00, Logan Gunthorpe <logang@deltatee.com> wrote:

>
>
> On 04/12/17 06:47 PM, ThanhTuThai wrote:
> > The problem is that the good host even didn't receive any message when
> > the crashed peer come up, so your  "re-init scheduled work" will not be
> > activated.
> > Because the good peer didn't receive any interrupt to activate the
> > scheduled work.
>
> What's your evidence here? You haven't really sent me anything to go on.
> Did you actually try the patch or did you just assume it doesn't work?
>
> In any case, if your assertion is correct, then it's a firmware issue
> and you'll have to contact Microsemi and get them to look at it.
>
> Logan
>

[-- Attachment #2: Type: text/html, Size: 1807 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
  2017-12-05  3:34     ` ThanhTuThai
@ 2017-12-05  3:44       ` Logan Gunthorpe
       [not found]         ` <CAJQW-Q5p2aY2Bahx49MG24HeYvGVmW-dDBJ_aHLAqjVKGS-zpQ@mail.gmail.com>
  0 siblings, 1 reply; 9+ messages in thread
From: Logan Gunthorpe @ 2017-12-05  3:44 UTC (permalink / raw)
  To: ThanhTuThai; +Cc: linux-ntb

Everything you said is very vague. In the future you should try to be
more specific. Nothing beats actual dmesg output, even if the log
message you're referring to (I can only assume which one because you
didn't say) isn't there, then there might be other clues as to why that
is. Trying to debug based on vague statements and incomplete information
is very hard.

I had another thought that might explain what you are seeing, but
without actual debug information to confirm, I don't think I'm going to
bother trying it.

If you're confident in your diagnosis then I suggest you contact
Microsemi about the issue.

Should we merge v1 of the patch seeing it fixes the soft-reboot issue?

Logan



On 04/12/17 08:34 PM, ThanhTuThai wrote:
> Dear Logan,
> 
> The evidence is the "printk" in interrupt routine.
> For software-reboot, the "printk" printed out perfectly.
> For power-reset, it didn't, is it enough ?
> 
> I actually try your patch, and I even wrote the solution you suggested,
> I also implemented it myself before, but no message received from the
> crashed peer, I said that to you many times.
> 
> I didn't send you the log, b/c nothing there, it doesn't receive any
> message, so it didn't log anything.
> 
> Anyway, thanks for you support !!!
> 
> On 5 December 2017 at 11:00, Logan Gunthorpe <logang@deltatee.com
> <mailto:logang@deltatee.com>> wrote:
> 
> 
> 
>     On 04/12/17 06:47 PM, ThanhTuThai wrote:
>     > The problem is that the good host even didn't receive any message when
>     > the crashed peer come up, so your  "re-init scheduled work" will not be
>     > activated.
>     > Because the good peer didn't receive any interrupt to activate the
>     > scheduled work.
> 
>     What's your evidence here? You haven't really sent me anything to go on.
>     Did you actually try the patch or did you just assume it doesn't work?
> 
>     In any case, if your assertion is correct, then it's a firmware issue
>     and you'll have to contact Microsemi and get them to look at it.
> 
>     Logan
> 
> 
> -- 
> You received this message because you are subscribed to the Google
> Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send
> an email to linux-ntb+unsubscribe@googlegroups.com
> <mailto:linux-ntb+unsubscribe@googlegroups.com>.
> To post to this group, send email to linux-ntb@googlegroups.com
> <mailto:linux-ntb@googlegroups.com>.
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/linux-ntb/CAJQW-Q733PvoghsDMiRwp7w34h%2BoXVwxr_g45Z-_Wu4%3DNQCMuA%40mail.gmail.com
> <https://groups.google.com/d/msgid/linux-ntb/CAJQW-Q733PvoghsDMiRwp7w34h%2BoXVwxr_g45Z-_Wu4%3DNQCMuA%40mail.gmail.com?utm_medium=email&utm_source=footer>.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
       [not found]         ` <CAJQW-Q5p2aY2Bahx49MG24HeYvGVmW-dDBJ_aHLAqjVKGS-zpQ@mail.gmail.com>
@ 2017-12-05  6:20           ` Logan Gunthorpe
  2017-12-05 18:51             ` Jon Mason
  0 siblings, 1 reply; 9+ messages in thread
From: Logan Gunthorpe @ 2017-12-05  6:20 UTC (permalink / raw)
  To: Jon Mason; +Cc: ThanhTuThai, linux-ntb



On 2017-12-04 10:12 PM, ThanhTuThai wrote:
> Dear Logan,
>
> The log message I mentioned on the previous mail is dmesg.
> But I believe that it has nothing to do with the some early mail I sent,
> I already described the issue.
>
> I saw your solution for soft-reset work perfectly. So I think we can
> merge it.

@Jon, can you please merge v1 of my original patch as it seems to fix 
most of the problem. Let me know if you need me to resend it. It's 
probably worth pushing upstream before 4.15 is released too.

Thanks,

Logan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
  2017-12-05  6:20           ` Logan Gunthorpe
@ 2017-12-05 18:51             ` Jon Mason
  0 siblings, 0 replies; 9+ messages in thread
From: Jon Mason @ 2017-12-05 18:51 UTC (permalink / raw)
  To: Logan Gunthorpe; +Cc: ThanhTuThai, linux-ntb

On Tue, Dec 5, 2017 at 1:20 AM, Logan Gunthorpe <logang@deltatee.com> wrote:
>
>
> On 2017-12-04 10:12 PM, ThanhTuThai wrote:
>>
>> Dear Logan,
>>
>> The log message I mentioned on the previous mail is dmesg.
>> But I believe that it has nothing to do with the some early mail I sent,
>> I already described the issue.
>>
>> I saw your solution for soft-reset work perfectly. So I think we can
>> merge it.
>
>
> @Jon, can you please merge v1 of my original patch as it seems to fix most
> of the problem. Let me know if you need me to resend it. It's probably worth
> pushing upstream before 4.15 is released too.

I'll pull it into my ntb branch.

ThanhTuThai, if you can determine a way to fix the link issue, please
send a patch to the list.  If you need assistance with this, please
send full dmesg output and other details Logan requested and we'll see
what we (as a community) can do.

Thanks,
Jon

>
> Thanks,
>
> Logan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
  2017-12-04 17:57 [PATCH v2] ntb_hw_switchtec: Force down the link before initializing Logan Gunthorpe
  2017-12-05  1:47 ` ThanhTuThai
@ 2017-12-05 19:30 ` Jon Mason
  2017-12-05 19:46   ` Logan Gunthorpe
  1 sibling, 1 reply; 9+ messages in thread
From: Jon Mason @ 2017-12-05 19:30 UTC (permalink / raw)
  To: Logan Gunthorpe; +Cc: linux-ntb, ThanhTuThai

On Mon, Dec 4, 2017 at 12:57 PM, Logan Gunthorpe <logang@deltatee.com> wrote:
> If one host crashes and soft reboots, the other host may not see a
> link down event. Then when the crashed host comes back up, the
> surviving host may not know the link was reset and the NTB clients
> may not work without being reset.
>
> To solve this, we send a LINK_FORCE_DOWN message to each peer every
> time we come up, before we register the NTB device. If a surviving
> host still thinks the link is up it will take it down immediately.
> In this way, once the crashed host comes up fully, it will send a
> regular link up event as per usual and the link will be properly
> restarted.
>
> While we are in the area, this also fixes the MSG_LINK_UP message that
> was in the link down function that was reported by Doug Meyers.

So, this only applies on the ntb-next branch, and applying it after
your multi-partition patch series causes it to fail to apply.  The
fixup appears to be trivial, but please double check me.

Thanks,
Jon

> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> Reported-by: ThanhTuThai <cruisethai@gmail.com>
> ---
>
> I had a small epiphany over the weekend as to why the link still was
> not coming up after a hard reboot. I think this will fix it.
>
> @Thai Thu: Can you please retest?
>
> Changes in v2:
>  * Reinitialize the shared memory window on a forced link down
>
>  drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 57 +++++++++++++++++++++++++++++-----
>  1 file changed, 50 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
> index 709f37fbe232..6bc213cf7bf6 100644
> --- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
> +++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
> @@ -118,6 +118,7 @@ struct switchtec_ntb {
>         bool link_is_up;
>         enum ntb_speed link_speed;
>         enum ntb_width link_width;
> +       struct work_struct link_reinit_work;
>  };
>
>  static struct switchtec_ntb *ntb_sndev(struct ntb_dev *ntb)
> @@ -463,18 +464,43 @@ static void switchtec_ntb_set_link_speed(struct switchtec_ntb *sndev)
>         sndev->link_width = min(self_width, peer_width);
>  }
>
> -enum {
> +enum switchtec_msg {
>         LINK_MESSAGE = 0,
>         MSG_LINK_UP = 1,
>         MSG_LINK_DOWN = 2,
>         MSG_CHECK_LINK = 3,
> +       MSG_LINK_FORCE_DOWN = 4,
>  };
>
> -static void switchtec_ntb_check_link(struct switchtec_ntb *sndev)
> +static int switchtec_ntb_reinit_peer(struct switchtec_ntb *sndev);
> +
> +static void link_reinit_work(struct work_struct *work)
> +{
> +       struct switchtec_ntb *sndev;
> +
> +       sndev = container_of(work, struct switchtec_ntb, link_reinit_work);
> +
> +       switchtec_ntb_reinit_peer(sndev);
> +}
> +
> +static void switchtec_ntb_check_link(struct switchtec_ntb *sndev,
> +                                    enum switchtec_msg msg)
>  {
>         int link_sta;
>         int old = sndev->link_is_up;
>
> +       if (msg == MSG_LINK_FORCE_DOWN) {
> +               schedule_work(&sndev->link_reinit_work);
> +
> +               if (sndev->link_is_up) {
> +                       sndev->link_is_up = 0;
> +                       ntb_link_event(&sndev->ntb);
> +                       dev_info(&sndev->stdev->dev, "ntb link forced down\n");
> +               }
> +
> +               return;
> +       }
> +
>         link_sta = sndev->self_shared->link_sta;
>         if (link_sta) {
>                 u64 peer = ioread64(&sndev->peer_shared->magic);
> @@ -500,7 +526,7 @@ static void switchtec_ntb_link_notification(struct switchtec_dev *stdev)
>  {
>         struct switchtec_ntb *sndev = stdev->sndev;
>
> -       switchtec_ntb_check_link(sndev);
> +       switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);
>  }
>
>  static u64 switchtec_ntb_link_is_up(struct ntb_dev *ntb,
> @@ -528,7 +554,7 @@ static int switchtec_ntb_link_enable(struct ntb_dev *ntb,
>         sndev->self_shared->link_sta = 1;
>         switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_UP);
>
> -       switchtec_ntb_check_link(sndev);
> +       switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);
>
>         return 0;
>  }
> @@ -540,9 +566,9 @@ static int switchtec_ntb_link_disable(struct ntb_dev *ntb)
>         dev_dbg(&sndev->stdev->dev, "disabling link\n");
>
>         sndev->self_shared->link_sta = 0;
> -       switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_UP);
> +       switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_DOWN);
>
> -       switchtec_ntb_check_link(sndev);
> +       switchtec_ntb_check_link(sndev, MSG_CHECK_LINK);
>
>         return 0;
>  }
> @@ -785,6 +811,8 @@ static void switchtec_ntb_init_sndev(struct switchtec_ntb *sndev)
>         sndev->ntb.topo = NTB_TOPO_SWITCH;
>         sndev->ntb.ops = &switchtec_ntb_ops;
>
> +       INIT_WORK(&sndev->link_reinit_work, link_reinit_work);
> +
>         sndev->self_partition = sndev->stdev->partition;
>
>         sndev->mmio_ntb = sndev->stdev->mmio_ntb;
> @@ -1062,7 +1090,7 @@ static irqreturn_t switchtec_ntb_message_isr(int irq, void *dev)
>                         iowrite8(1, &sndev->mmio_self_dbmsg->imsg[i].status);
>
>                         if (i == LINK_MESSAGE)
> -                               switchtec_ntb_check_link(sndev);
> +                               switchtec_ntb_check_link(sndev, msg);
>                 }
>         }
>
> @@ -1123,6 +1151,14 @@ static void switchtec_ntb_deinit_db_msg_irq(struct switchtec_ntb *sndev)
>         free_irq(sndev->message_irq, sndev);
>  }
>
> +static int switchtec_ntb_reinit_peer(struct switchtec_ntb *sndev)
> +{
> +       dev_info(&sndev->stdev->dev, "peer reinitialized\n");
> +       switchtec_ntb_deinit_shared_mw(sndev);
> +       switchtec_ntb_init_mw(sndev);
> +       return switchtec_ntb_init_shared_mw(sndev);
> +}
> +
>  static int switchtec_ntb_add(struct device *dev,
>                              struct class_interface *class_intf)
>  {
> @@ -1160,6 +1196,13 @@ static int switchtec_ntb_add(struct device *dev,
>         if (rc)
>                 goto deinit_shared_and_exit;
>
> +       /*
> +        * If this host crashed, the other host may think the link is
> +        * still up. Tell them to force it down (it will go back up
> +        * once we register the ntb device).
> +        */
> +       switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_FORCE_DOWN);
> +
>         rc = ntb_register_device(&sndev->ntb);
>         if (rc)
>                 goto deinit_and_exit;
> --
> 2.11.0
>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to linux-ntb+unsubscribe@googlegroups.com.
> To post to this group, send email to linux-ntb@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/20171204175721.10322-1-logang%40deltatee.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] ntb_hw_switchtec: Force down the link before initializing
  2017-12-05 19:30 ` Jon Mason
@ 2017-12-05 19:46   ` Logan Gunthorpe
  0 siblings, 0 replies; 9+ messages in thread
From: Logan Gunthorpe @ 2017-12-05 19:46 UTC (permalink / raw)
  To: Jon Mason; +Cc: linux-ntb, ThanhTuThai



On 05/12/17 12:30 PM, Jon Mason wrote:
> On Mon, Dec 4, 2017 at 12:57 PM, Logan Gunthorpe <logang@deltatee.com> wrote:
>> If one host crashes and soft reboots, the other host may not see a
>> link down event. Then when the crashed host comes back up, the
>> surviving host may not know the link was reset and the NTB clients
>> may not work without being reset.
>>
>> To solve this, we send a LINK_FORCE_DOWN message to each peer every
>> time we come up, before we register the NTB device. If a surviving
>> host still thinks the link is up it will take it down immediately.
>> In this way, once the crashed host comes up fully, it will send a
>> regular link up event as per usual and the link will be properly
>> restarted.
>>
>> While we are in the area, this also fixes the MSG_LINK_UP message that
>> was in the link down function that was reported by Doug Meyers.
> 
> So, this only applies on the ntb-next branch, and applying it after
> your multi-partition patch series causes it to fail to apply.  The
> fixup appears to be trivial, but please double check me.

Oops sorry. I think I checked that v1 merged cleanly but didn't look at 
v2. I double checked the merged commit in ntb-next and it looks good. 
I'll do some testing on ntb-next later in the week.

Thanks,

Logan

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-12-05 19:46 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-04 17:57 [PATCH v2] ntb_hw_switchtec: Force down the link before initializing Logan Gunthorpe
2017-12-05  1:47 ` ThanhTuThai
2017-12-05  3:00   ` Logan Gunthorpe
2017-12-05  3:34     ` ThanhTuThai
2017-12-05  3:44       ` Logan Gunthorpe
     [not found]         ` <CAJQW-Q5p2aY2Bahx49MG24HeYvGVmW-dDBJ_aHLAqjVKGS-zpQ@mail.gmail.com>
2017-12-05  6:20           ` Logan Gunthorpe
2017-12-05 18:51             ` Jon Mason
2017-12-05 19:30 ` Jon Mason
2017-12-05 19:46   ` Logan Gunthorpe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.