linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v1] Bluetooth: Add ncmd=0 recovery handling
@ 2021-03-19 20:16 Manish Mandlik
  2021-03-20 17:33 ` Marcel Holtmann
  0 siblings, 1 reply; 2+ messages in thread
From: Manish Mandlik @ 2021-03-19 20:16 UTC (permalink / raw)
  To: marcel, luiz.dentz
  Cc: Alain Michaud, chromeos-bluetooth-upstreaming, linux-bluetooth,
	Manish Mandlik, Abhishek Pandit-Subedi, David S. Miller,
	Jakub Kicinski, Johan Hedberg, linux-kernel, netdev

During command status or command complete event, the controller may set
ncmd=0 indicating that it is not accepting any more commands. In such a
case, host holds off sending any more commands to the controller. If the
controller doesn't recover from such condition, host will wait forever.

This patch adds a timer when controller gets into such condition and
resets the controller if controller doesn't recover within the timeout
period.

Reviewed-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Manish Mandlik <mmandlik@google.com>
---
Hello Maintainers,

We noticed that during suspend, sometimes the controller firmware gets
into a state where it is not accepting any more commands (it returns
ncmd=0 in Command Status):

< HCI Command: Disconnect (0x01|0x0006) plen 3  #398 [hci0] 83.760502
        Handle: 1
        Reason: Remote Device Terminated due to Power Off (0x15)
> HCI Event: Command Status (0x0f) plen 4       #399 [hci0] 83.761694
      Disconnect (0x01|0x0006) ncmd 0
        Status: Success (0x00)

In such a case, the host holds off sending any more packets to the
controller until it is ready to accept more commands. If the controller
doesn't recover from such a condition, Command Timeout does not get
triggered as Command Timeout is queued only once the packet is sent to
the controller; hence, the host will wait forever. 

This patch adds a timer to recover from this condition. Since the
suspend timeout is 2 seconds, I'm using 4 seconds timeout to recover
from ncmd=0. This should give ample amount of time for recovery and
should not create any race conditions with the suspend. Once we resume
from the suspend normally, the timer would expire and reset the
controller. I have verified this patch locally and able to connect to
peer device after resume from suspend. Please let me know your thoughts
on this.

Thanks,
Manish.

 include/net/bluetooth/hci.h      |  1 +
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_core.c         | 15 +++++++++++++++
 net/bluetooth/hci_event.c        | 10 ++++++++++
 4 files changed, 27 insertions(+)

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index ea4ae551c426..c4b0650fb9ae 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -339,6 +339,7 @@ enum {
 #define HCI_PAIRING_TIMEOUT	msecs_to_jiffies(60000)	/* 60 seconds */
 #define HCI_INIT_TIMEOUT	msecs_to_jiffies(10000)	/* 10 seconds */
 #define HCI_CMD_TIMEOUT		msecs_to_jiffies(2000)	/* 2 seconds */
+#define HCI_NCMD_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
 #define HCI_ACL_TX_TIMEOUT	msecs_to_jiffies(45000)	/* 45 seconds */
 #define HCI_AUTO_OFF_TIMEOUT	msecs_to_jiffies(2000)	/* 2 seconds */
 #define HCI_POWER_OFF_TIMEOUT	msecs_to_jiffies(5000)	/* 5 seconds */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index ebdd4afe30d2..f14692b39fd5 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -470,6 +470,7 @@ struct hci_dev {
 	struct delayed_work	service_cache;
 
 	struct delayed_work	cmd_timer;
+	struct delayed_work	ncmd_timer;
 
 	struct work_struct	rx_work;
 	struct work_struct	cmd_work;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index b0d9c36acc03..5ee1609456bd 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2769,6 +2769,20 @@ static void hci_cmd_timeout(struct work_struct *work)
 	queue_work(hdev->workqueue, &hdev->cmd_work);
 }
 
+/* HCI ncmd timer function */
+static void hci_ncmd_timeout(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    ncmd_timer.work);
+
+	bt_dev_err(hdev, "ncmd timeout");
+
+	if (hci_dev_do_close(hdev))
+		return;
+
+	hci_dev_do_open(hdev);
+}
+
 struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev,
 					  bdaddr_t *bdaddr, u8 bdaddr_type)
 {
@@ -3831,6 +3845,7 @@ struct hci_dev *hci_alloc_dev(void)
 	init_waitqueue_head(&hdev->suspend_wait_q);
 
 	INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout);
+	INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout);
 
 	hci_request_setup(hdev);
 
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index cf2f4a0abdbd..114a9170d809 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3635,6 +3635,11 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
 	if (*opcode != HCI_OP_NOP)
 		cancel_delayed_work(&hdev->cmd_timer);
 
+	if (!ev->ncmd && !test_bit(HCI_RESET, &hdev->flags))
+		schedule_delayed_work(&hdev->ncmd_timer, HCI_NCMD_TIMEOUT);
+	else
+		cancel_delayed_work(&hdev->ncmd_timer);
+
 	if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags))
 		atomic_set(&hdev->cmd_cnt, 1);
 
@@ -3740,6 +3745,11 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
 	if (*opcode != HCI_OP_NOP)
 		cancel_delayed_work(&hdev->cmd_timer);
 
+	if (!ev->ncmd && !test_bit(HCI_RESET, &hdev->flags))
+		schedule_delayed_work(&hdev->ncmd_timer, HCI_NCMD_TIMEOUT);
+	else
+		cancel_delayed_work(&hdev->ncmd_timer);
+
 	if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags))
 		atomic_set(&hdev->cmd_cnt, 1);
 
-- 
2.31.0.rc2.261.g7f71774620-goog


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v1] Bluetooth: Add ncmd=0 recovery handling
  2021-03-19 20:16 [PATCH v1] Bluetooth: Add ncmd=0 recovery handling Manish Mandlik
@ 2021-03-20 17:33 ` Marcel Holtmann
  0 siblings, 0 replies; 2+ messages in thread
From: Marcel Holtmann @ 2021-03-20 17:33 UTC (permalink / raw)
  To: Manish Mandlik
  Cc: Luiz Augusto von Dentz, Alain Michaud, CrosBT Upstreaming,
	Bluetooth Kernel Mailing List, Abhishek Pandit-Subedi,
	David S. Miller, Jakub Kicinski, Johan Hedberg, LKML, netdev

Hi Manish,

> During command status or command complete event, the controller may set
> ncmd=0 indicating that it is not accepting any more commands. In such a
> case, host holds off sending any more commands to the controller. If the
> controller doesn't recover from such condition, host will wait forever.
> 
> This patch adds a timer when controller gets into such condition and
> resets the controller if controller doesn't recover within the timeout
> period.
> 
> Reviewed-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
> Signed-off-by: Manish Mandlik <mmandlik@google.com>
> ---
> Hello Maintainers,
> 
> We noticed that during suspend, sometimes the controller firmware gets
> into a state where it is not accepting any more commands (it returns
> ncmd=0 in Command Status):
> 
> < HCI Command: Disconnect (0x01|0x0006) plen 3  #398 [hci0] 83.760502
>         Handle: 1
>         Reason: Remote Device Terminated due to Power Off (0x15)
>> HCI Event: Command Status (0x0f) plen 4       #399 [hci0] 83.761694
>       Disconnect (0x01|0x0006) ncmd 0
>         Status: Success (0x00)
> 
> In such a case, the host holds off sending any more packets to the
> controller until it is ready to accept more commands. If the controller
> doesn't recover from such a condition, Command Timeout does not get
> triggered as Command Timeout is queued only once the packet is sent to
> the controller; hence, the host will wait forever. 
> 
> This patch adds a timer to recover from this condition. Since the
> suspend timeout is 2 seconds, I'm using 4 seconds timeout to recover
> from ncmd=0. This should give ample amount of time for recovery and
> should not create any race conditions with the suspend. Once we resume
> from the suspend normally, the timer would expire and reset the
> controller. I have verified this patch locally and able to connect to
> peer device after resume from suspend. Please let me know your thoughts
> on this.
> 
> Thanks,
> Manish.
> 
> include/net/bluetooth/hci.h      |  1 +
> include/net/bluetooth/hci_core.h |  1 +
> net/bluetooth/hci_core.c         | 15 +++++++++++++++
> net/bluetooth/hci_event.c        | 10 ++++++++++
> 4 files changed, 27 insertions(+)
> 
> diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
> index ea4ae551c426..c4b0650fb9ae 100644
> --- a/include/net/bluetooth/hci.h
> +++ b/include/net/bluetooth/hci.h
> @@ -339,6 +339,7 @@ enum {
> #define HCI_PAIRING_TIMEOUT	msecs_to_jiffies(60000)	/* 60 seconds */
> #define HCI_INIT_TIMEOUT	msecs_to_jiffies(10000)	/* 10 seconds */
> #define HCI_CMD_TIMEOUT		msecs_to_jiffies(2000)	/* 2 seconds */
> +#define HCI_NCMD_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
> #define HCI_ACL_TX_TIMEOUT	msecs_to_jiffies(45000)	/* 45 seconds */
> #define HCI_AUTO_OFF_TIMEOUT	msecs_to_jiffies(2000)	/* 2 seconds */
> #define HCI_POWER_OFF_TIMEOUT	msecs_to_jiffies(5000)	/* 5 seconds */
> diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
> index ebdd4afe30d2..f14692b39fd5 100644
> --- a/include/net/bluetooth/hci_core.h
> +++ b/include/net/bluetooth/hci_core.h
> @@ -470,6 +470,7 @@ struct hci_dev {
> 	struct delayed_work	service_cache;
> 
> 	struct delayed_work	cmd_timer;
> +	struct delayed_work	ncmd_timer;
> 
> 	struct work_struct	rx_work;
> 	struct work_struct	cmd_work;
> diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
> index b0d9c36acc03..5ee1609456bd 100644
> --- a/net/bluetooth/hci_core.c
> +++ b/net/bluetooth/hci_core.c
> @@ -2769,6 +2769,20 @@ static void hci_cmd_timeout(struct work_struct *work)
> 	queue_work(hdev->workqueue, &hdev->cmd_work);
> }
> 
> +/* HCI ncmd timer function */
> +static void hci_ncmd_timeout(struct work_struct *work)
> +{
> +	struct hci_dev *hdev = container_of(work, struct hci_dev,
> +					    ncmd_timer.work);
> +
> +	bt_dev_err(hdev, "ncmd timeout");
> +
> +	if (hci_dev_do_close(hdev))
> +		return;
> +
> +	hci_dev_do_open(hdev);
> +}
> +

I am pretty certain this can dead-lock if ncmd=0 happens inside hci_dev_do_open,do_close itself.

The second thing is that do_close+do_open is heavy hammer you are swinging here. It will also result in mgmt powered down/up. Is this something you really want since bluetoothd will notice this and has to re-init everything.

Regards

Marcel


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-03-20 17:34 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-19 20:16 [PATCH v1] Bluetooth: Add ncmd=0 recovery handling Manish Mandlik
2021-03-20 17:33 ` Marcel Holtmann

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).