All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] fixes for hangs and error reporting in CDC_WDM
@ 2020-09-22 11:21 Oliver Neukum
  2020-09-22 11:21 ` [RFC 1/7] CDC-WDM: fix hangs in flush() in multithreaded cases Oliver Neukum
                   ` (7 more replies)
  0 siblings, 8 replies; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:21 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb

Stress testing has shown that CDC-WDM has some issues with hangs
and error reporting

1. wakeups are not correctly handled in multhreaded environments
2. unresponsive hardware is not handled
3. errors are not correctly reported. This needs flush() to be
implemented.

This version makes wdm_flush() use interruptible sleep.

For easier review all squashed together:

--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -58,6 +58,9 @@ MODULE_DEVICE_TABLE (usb, wdm_ids);
 
 #define WDM_MAX                        16
 
+/* flush() is uninterruptible, but we cannot wait forever */
+#define WDM_FLUSH_TIMEOUT      (30 * HZ)
+
 /* CDC-WMC r1.1 requires wMaxCommand to be "at least 256 decimal (0x100)" */
 #define WDM_DEFAULT_BUFSIZE    256
 
@@ -151,7 +154,7 @@ static void wdm_out_callback(struct urb *urb)
        kfree(desc->outbuf);
        desc->outbuf = NULL;
        clear_bit(WDM_IN_USE, &desc->flags);
-       wake_up(&desc->wait);
+       wake_up_all(&desc->wait);
 }
 
 static void wdm_in_callback(struct urb *urb)
@@ -393,6 +396,9 @@ static ssize_t wdm_write
        if (test_bit(WDM_RESETTING, &desc->flags))
                r = -EIO;
 
+       if (test_bit(WDM_DISCONNECTING, &desc->flags))
+               r = -ENODEV;
+
        if (r < 0) {
                rv = r;
                goto out_free_mem_pm;
@@ -424,6 +430,7 @@ static ssize_t wdm_write
        if (rv < 0) {
                desc->outbuf = NULL;
                clear_bit(WDM_IN_USE, &desc->flags);
+               wake_up_all(&desc->wait); /* for flush() */
                dev_err(&desc->intf->dev, "Tx URB error: %d\n", rv);
                rv = usb_translate_errors(rv);
                goto out_free_mem_pm;
@@ -583,11 +590,39 @@ static ssize_t wdm_read
        return rv;
 }
 
+/*
+ * The difference to flush is that we wait forever. If you don't like
+ * that behavior, you need to send a signal.
+ */
+
+static int wdm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+       struct wdm_device *desc = file->private_data;
+       int rv;
+
+       rv = wait_event_interruptible(desc->wait,
+                       !test_bit(WDM_IN_USE, &desc->flags) ||
+                       test_bit(WDM_DISCONNECTING, &desc->flags));
+
+       if (test_bit(WDM_DISCONNECTING, &desc->flags))
+               return -ENODEV;
+       if (rv < 0)
+               return -EINTR;
+
+       spin_lock_irq(&desc->iuspin);
+       rv = desc->werr;
+       desc->werr = 0;
+       spin_unlock_irq(&desc->iuspin);
+
+       return usb_translate_errors(rv);
+}
+
 static int wdm_flush(struct file *file, fl_owner_t id)
 {
        struct wdm_device *desc = file->private_data;
+       int rv;
 
-       wait_event(desc->wait,
+       rv = wait_event_interruptible_timeout(desc->wait,
                        /*
                         * needs both flags. We cannot do with one
                         * because resetting it would cause a race
@@ -595,16 +630,27 @@ static int wdm_flush(struct file *file, fl_owner_t id)
                         * a disconnect
                         */
                        !test_bit(WDM_IN_USE, &desc->flags) ||
-                       test_bit(WDM_DISCONNECTING, &desc->flags));
+                       test_bit(WDM_DISCONNECTING, &desc->flags),
+                       WDM_FLUSH_TIMEOUT);
 
-       /* cannot dereference desc->intf if WDM_DISCONNECTING */
+       /*
+        * to report the correct error.
+        * This is best effort
+        * We are inevitably racing with the hardware.
+        */
        if (test_bit(WDM_DISCONNECTING, &desc->flags))
                return -ENODEV;
-       if (desc->werr < 0)
-               dev_err(&desc->intf->dev, "Error in flush path: %d\n",
-                       desc->werr);
+       if (!rv)
+               return -EIO;
+       if (rv < 0)
+               return -EINTR;
+
+       spin_lock_irq(&desc->iuspin);
+       rv = desc->werr;
+       desc->werr = 0;
+       spin_unlock_irq(&desc->iuspin);
 
-       return usb_translate_errors(desc->werr);
+       return usb_translate_errors(rv);
 }
 
 static __poll_t wdm_poll(struct file *file, struct poll_table_struct *wait)
@@ -729,6 +775,7 @@ static const struct file_operations wdm_fops = {
        .owner =        THIS_MODULE,
        .read =         wdm_read,
        .write =        wdm_write,
+       .fsync =        wdm_fsync,
        .open =         wdm_open,
        .flush =        wdm_flush,
        .release =      wdm_release,



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [RFC 1/7] CDC-WDM: fix hangs in flush() in multithreaded cases
  2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
@ 2020-09-22 11:21 ` Oliver Neukum
  2020-09-22 11:21 ` [RFC 2/7] CDC-WDM: introduce a timeout in flush() Oliver Neukum
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:21 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb; +Cc: Oliver Neukum

In a multithreaded scenario an arbitrary number of threads
can be in wdm_write() and in wdm_flush(). Hence whenever
WDM_IN_USE is reset, all possible waiters need to be notified.
This is true whether this is due to IO completing or
to an error starting IO.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
 drivers/usb/class/cdc-wdm.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index e3db6fbeadef..adb3fc307083 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -151,7 +151,7 @@ static void wdm_out_callback(struct urb *urb)
 	kfree(desc->outbuf);
 	desc->outbuf = NULL;
 	clear_bit(WDM_IN_USE, &desc->flags);
-	wake_up(&desc->wait);
+	wake_up_all(&desc->wait);
 }
 
 static void wdm_in_callback(struct urb *urb)
@@ -424,6 +424,7 @@ static ssize_t wdm_write
 	if (rv < 0) {
 		desc->outbuf = NULL;
 		clear_bit(WDM_IN_USE, &desc->flags);
+		wake_up_all(&desc->wait); /* for flush() */
 		dev_err(&desc->intf->dev, "Tx URB error: %d\n", rv);
 		rv = usb_translate_errors(rv);
 		goto out_free_mem_pm;
@@ -586,6 +587,7 @@ static ssize_t wdm_read
 static int wdm_flush(struct file *file, fl_owner_t id)
 {
 	struct wdm_device *desc = file->private_data;
+	int rv;
 
 	wait_event(desc->wait,
 			/*
@@ -600,11 +602,12 @@ static int wdm_flush(struct file *file, fl_owner_t id)
 	/* cannot dereference desc->intf if WDM_DISCONNECTING */
 	if (test_bit(WDM_DISCONNECTING, &desc->flags))
 		return -ENODEV;
-	if (desc->werr < 0)
+	rv = desc->werr;
+	if (rv < 0)
 		dev_err(&desc->intf->dev, "Error in flush path: %d\n",
-			desc->werr);
+			rv);
 
-	return usb_translate_errors(desc->werr);
+	return usb_translate_errors(rv);
 }
 
 static __poll_t wdm_poll(struct file *file, struct poll_table_struct *wait)
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC 2/7] CDC-WDM: introduce a timeout in flush()
  2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
  2020-09-22 11:21 ` [RFC 1/7] CDC-WDM: fix hangs in flush() in multithreaded cases Oliver Neukum
@ 2020-09-22 11:21 ` Oliver Neukum
  2020-09-22 11:21 ` [RFC 3/7] CDC-WDM: remove use of intf->dev after potential disconnect Oliver Neukum
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:21 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb; +Cc: Oliver Neukum

Malicious or defective hardware may cease communication while
flush() is running. In last consequence we need a timeout,
as hardware that remains silent must be ignored.

Making the wait for IO interruptible would not solve the
issue. While it would avoid a hang, it would not allow any progress
and we would end up with an unclosable fd.

The 30 seconds are coming out of thin air.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
 drivers/usb/class/cdc-wdm.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index adb3fc307083..b3a3f249a915 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -58,6 +58,9 @@ MODULE_DEVICE_TABLE (usb, wdm_ids);
 
 #define WDM_MAX			16
 
+/* flush() is uninterruptible, but we cannot wait forever */
+#define WDM_FLUSH_TIMEOUT	(30 * HZ)
+
 /* CDC-WMC r1.1 requires wMaxCommand to be "at least 256 decimal (0x100)" */
 #define WDM_DEFAULT_BUFSIZE	256
 
@@ -589,7 +592,7 @@ static int wdm_flush(struct file *file, fl_owner_t id)
 	struct wdm_device *desc = file->private_data;
 	int rv;
 
-	wait_event(desc->wait,
+	rv = wait_event_timeout(desc->wait,
 			/*
 			 * needs both flags. We cannot do with one
 			 * because resetting it would cause a race
@@ -597,11 +600,15 @@ static int wdm_flush(struct file *file, fl_owner_t id)
 			 * a disconnect
 			 */
 			!test_bit(WDM_IN_USE, &desc->flags) ||
-			test_bit(WDM_DISCONNECTING, &desc->flags));
+			test_bit(WDM_DISCONNECTING, &desc->flags),
+			WDM_FLUSH_TIMEOUT);
 
 	/* cannot dereference desc->intf if WDM_DISCONNECTING */
 	if (test_bit(WDM_DISCONNECTING, &desc->flags))
 		return -ENODEV;
+	if (!rv)
+		return -EIO;
+
 	rv = desc->werr;
 	if (rv < 0)
 		dev_err(&desc->intf->dev, "Error in flush path: %d\n",
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC 3/7] CDC-WDM: remove use of intf->dev after potential disconnect
  2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
  2020-09-22 11:21 ` [RFC 1/7] CDC-WDM: fix hangs in flush() in multithreaded cases Oliver Neukum
  2020-09-22 11:21 ` [RFC 2/7] CDC-WDM: introduce a timeout in flush() Oliver Neukum
@ 2020-09-22 11:21 ` Oliver Neukum
  2020-09-22 11:21 ` [RFC 4/7] CDC-WDM: fix race reporting errors in flush Oliver Neukum
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:21 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb; +Cc: Oliver Neukum

After a disconnect intf->dev is not a valid pointer any longer.
As flush() uses it only for logging purposes logging is not
worth it. Remove the dev_err()

Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
 drivers/usb/class/cdc-wdm.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index b3a3f249a915..1ca2c85a977d 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -603,16 +603,17 @@ static int wdm_flush(struct file *file, fl_owner_t id)
 			test_bit(WDM_DISCONNECTING, &desc->flags),
 			WDM_FLUSH_TIMEOUT);
 
-	/* cannot dereference desc->intf if WDM_DISCONNECTING */
+	/*
+	 * to report the correct error.
+	 * This is best effort
+	 * We are inevitably racing with the hardware.
+	 */
 	if (test_bit(WDM_DISCONNECTING, &desc->flags))
 		return -ENODEV;
 	if (!rv)
 		return -EIO;
 
 	rv = desc->werr;
-	if (rv < 0)
-		dev_err(&desc->intf->dev, "Error in flush path: %d\n",
-			rv);
 
 	return usb_translate_errors(rv);
 }
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC 4/7] CDC-WDM: fix race reporting errors in flush
  2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
                   ` (2 preceding siblings ...)
  2020-09-22 11:21 ` [RFC 3/7] CDC-WDM: remove use of intf->dev after potential disconnect Oliver Neukum
@ 2020-09-22 11:21 ` Oliver Neukum
  2020-09-22 11:21 ` [RFC 5/7] CDC-WDM: correct error reporting in write() Oliver Neukum
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:21 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb; +Cc: Oliver Neukum

In case a race was lost and multiple fds used,
an error could be reported multiple times. To fix
this a spinlock must be taken.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
 drivers/usb/class/cdc-wdm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 1ca2c85a977d..5fb855404403 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -613,7 +613,10 @@ static int wdm_flush(struct file *file, fl_owner_t id)
 	if (!rv)
 		return -EIO;
 
+	spin_lock_irq(&desc->iuspin);
 	rv = desc->werr;
+	desc->werr = 0;
+	spin_unlock_irq(&desc->iuspin);
 
 	return usb_translate_errors(rv);
 }
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC 5/7] CDC-WDM: correct error reporting in write()
  2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
                   ` (3 preceding siblings ...)
  2020-09-22 11:21 ` [RFC 4/7] CDC-WDM: fix race reporting errors in flush Oliver Neukum
@ 2020-09-22 11:21 ` Oliver Neukum
  2020-09-22 11:21 ` [RFC 6/7] CDC-WDM: implement fsync Oliver Neukum
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:21 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb; +Cc: Oliver Neukum

In case our wait was interrupted by a disconnect, we should report
that.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
 drivers/usb/class/cdc-wdm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 5fb855404403..242f83118208 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -396,6 +396,9 @@ static ssize_t wdm_write
 	if (test_bit(WDM_RESETTING, &desc->flags))
 		r = -EIO;
 
+	if (test_bit(WDM_DISCONNECTING, &desc->flags))
+		r = -ENODEV;
+
 	if (r < 0) {
 		rv = r;
 		goto out_free_mem_pm;
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC 6/7] CDC-WDM: implement fsync
  2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
                   ` (4 preceding siblings ...)
  2020-09-22 11:21 ` [RFC 5/7] CDC-WDM: correct error reporting in write() Oliver Neukum
@ 2020-09-22 11:21 ` Oliver Neukum
  2020-09-22 11:21 ` [RFC 7/7] CDC-WDM: making flush() interruptible Oliver Neukum
  2020-09-25 15:11 ` [RFC] fixes for hangs and error reporting in CDC_WDM Greg KH
  7 siblings, 0 replies; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:21 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb; +Cc: Oliver Neukum

Some users want to be very sure that data has gone out to the
device. This needs fsync.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
 drivers/usb/class/cdc-wdm.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 242f83118208..6ea03c12380c 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -590,6 +590,33 @@ static ssize_t wdm_read
 	return rv;
 }
 
+/*
+ * The difference to flush is that we wait forever. If you don't like
+ * that behavior, you need to send a signal.
+ */
+
+static int wdm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct wdm_device *desc = file->private_data;
+	int rv;
+
+	rv = wait_event_interruptible(desc->wait,
+			!test_bit(WDM_IN_USE, &desc->flags) ||
+			test_bit(WDM_DISCONNECTING, &desc->flags));
+
+	if (test_bit(WDM_DISCONNECTING, &desc->flags))
+		return -ENODEV;
+	if (rv < 0)
+		return -EINTR;
+
+	spin_lock_irq(&desc->iuspin);
+	rv = desc->werr;
+	desc->werr = 0;
+	spin_unlock_irq(&desc->iuspin);
+
+	return usb_translate_errors(rv);
+}
+
 static int wdm_flush(struct file *file, fl_owner_t id)
 {
 	struct wdm_device *desc = file->private_data;
@@ -746,6 +773,7 @@ static const struct file_operations wdm_fops = {
 	.owner =	THIS_MODULE,
 	.read =		wdm_read,
 	.write =	wdm_write,
+	.fsync =	wdm_fsync,
 	.open =		wdm_open,
 	.flush =	wdm_flush,
 	.release =	wdm_release,
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC 7/7] CDC-WDM: making flush() interruptible
  2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
                   ` (5 preceding siblings ...)
  2020-09-22 11:21 ` [RFC 6/7] CDC-WDM: implement fsync Oliver Neukum
@ 2020-09-22 11:21 ` Oliver Neukum
  2020-09-22 11:38   ` Tetsuo Handa
  2020-09-25 15:11 ` [RFC] fixes for hangs and error reporting in CDC_WDM Greg KH
  7 siblings, 1 reply; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:21 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb; +Cc: Oliver Neukum

There is no need for flush() to be uninterruptible. close(2)
is allowed to return -EINTR.

30 seconds is quite long a time to sleep in an uninterruptible state.
Change it to an interruptible sleep.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Oliver Neukum <oneukum@suse.com>
---
 drivers/usb/class/cdc-wdm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 6ea03c12380c..b9cca1cb5058 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -622,7 +622,7 @@ static int wdm_flush(struct file *file, fl_owner_t id)
 	struct wdm_device *desc = file->private_data;
 	int rv;
 
-	rv = wait_event_timeout(desc->wait,
+	rv = wait_event_interruptible_timeout(desc->wait,
 			/*
 			 * needs both flags. We cannot do with one
 			 * because resetting it would cause a race
@@ -642,6 +642,8 @@ static int wdm_flush(struct file *file, fl_owner_t id)
 		return -ENODEV;
 	if (!rv)
 		return -EIO;
+	if (rv < 0)
+		return -EINTR;
 
 	spin_lock_irq(&desc->iuspin);
 	rv = desc->werr;
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [RFC 7/7] CDC-WDM: making flush() interruptible
  2020-09-22 11:21 ` [RFC 7/7] CDC-WDM: making flush() interruptible Oliver Neukum
@ 2020-09-22 11:38   ` Tetsuo Handa
  2020-09-22 11:49     ` Oliver Neukum
  0 siblings, 1 reply; 19+ messages in thread
From: Tetsuo Handa @ 2020-09-22 11:38 UTC (permalink / raw)
  To: Oliver Neukum; +Cc: bjorn, linux-usb

On 2020/09/22 20:21, Oliver Neukum wrote:
> There is no need for flush() to be uninterruptible. close(2)
> is allowed to return -EINTR.
> 
> 30 seconds is quite long a time to sleep in an uninterruptible state.
> Change it to an interruptible sleep.

Doesn't this conflict with

  Making the wait for IO interruptible would not solve the
  issue. While it would avoid a hang, it would not allow any progress
  and we would end up with an unclosable fd.

in [RFC 2/7] ? I suggested killable version, for giving up upon SIGKILL
implies automatically closing fds by terminating that userspace process.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC 7/7] CDC-WDM: making flush() interruptible
  2020-09-22 11:38   ` Tetsuo Handa
@ 2020-09-22 11:49     ` Oliver Neukum
  0 siblings, 0 replies; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 11:49 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: bjorn, linux-usb

Am Dienstag, den 22.09.2020, 20:38 +0900 schrieb Tetsuo Handa:
> On 2020/09/22 20:21, Oliver Neukum wrote:
> > There is no need for flush() to be uninterruptible. close(2)
> > is allowed to return -EINTR.
> > 
> > 30 seconds is quite long a time to sleep in an uninterruptible state.
> > Change it to an interruptible sleep.
> 
> Doesn't this conflict with
> 
>   Making the wait for IO interruptible would not solve the
>   issue. While it would avoid a hang, it would not allow any progress
>   and we would end up with an unclosable fd.
> 
> in [RFC 2/7] ? I suggested killable version, for giving up upon SIGKILL
> implies automatically closing fds by terminating that userspace process.

No. That we state in an earlier patch that an issue needs a timeout
instead of an interruptible sleep, does not mean that another issue
could not be solved by using an interruptible sleep.

	Regards
		Oliver


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] fixes for hangs and error reporting in CDC_WDM
  2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
                   ` (6 preceding siblings ...)
  2020-09-22 11:21 ` [RFC 7/7] CDC-WDM: making flush() interruptible Oliver Neukum
@ 2020-09-25 15:11 ` Greg KH
  2020-09-25 15:20   ` Tetsuo Handa
  7 siblings, 1 reply; 19+ messages in thread
From: Greg KH @ 2020-09-25 15:11 UTC (permalink / raw)
  To: Oliver Neukum; +Cc: penguin-kernel, bjorn, linux-usb

On Tue, Sep 22, 2020 at 01:21:19PM +0200, Oliver Neukum wrote:
> Stress testing has shown that CDC-WDM has some issues with hangs
> and error reporting
> 
> 1. wakeups are not correctly handled in multhreaded environments
> 2. unresponsive hardware is not handled
> 3. errors are not correctly reported. This needs flush() to be
> implemented.
> 
> This version makes wdm_flush() use interruptible sleep.
> 
> For easier review all squashed together:
> 

I have like 3 or 4 different "RFC" series here from you for this driver,
which one is the "newest"?

And can you send a series that isn't RFC so that I can know you feel it
is good enough to be merged?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] fixes for hangs and error reporting in CDC_WDM
  2020-09-25 15:11 ` [RFC] fixes for hangs and error reporting in CDC_WDM Greg KH
@ 2020-09-25 15:20   ` Tetsuo Handa
  2020-09-25 15:28     ` Greg KH
  0 siblings, 1 reply; 19+ messages in thread
From: Tetsuo Handa @ 2020-09-25 15:20 UTC (permalink / raw)
  To: Greg KH, Oliver Neukum; +Cc: bjorn, linux-usb

On 2020/09/26 0:11, Greg KH wrote:
> On Tue, Sep 22, 2020 at 01:21:19PM +0200, Oliver Neukum wrote:
>> Stress testing has shown that CDC-WDM has some issues with hangs
>> and error reporting
>>
>> 1. wakeups are not correctly handled in multhreaded environments
>> 2. unresponsive hardware is not handled
>> 3. errors are not correctly reported. This needs flush() to be
>> implemented.
>>
>> This version makes wdm_flush() use interruptible sleep.
>>
>> For easier review all squashed together:
>>
> 
> I have like 3 or 4 different "RFC" series here from you for this driver,
> which one is the "newest"?

https://lkml.kernel.org/r/20200923092136.14824-1-oneukum@suse.com

is the newest series from Oliver. But

https://lkml.kernel.org/r/b27841ab-a88c-13e2-a66f-6df7af1f46b4@i-love.sakura.ne.jp

is the squashed version with updated comments and deduplicated code.

> 
> And can you send a series that isn't RFC so that I can know you feel it
> is good enough to be merged?

Do you want this fix as a series of patches (the former link)?
Since I think that the changeset should be atomically applied, I posted the latter link.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] fixes for hangs and error reporting in CDC_WDM
  2020-09-25 15:20   ` Tetsuo Handa
@ 2020-09-25 15:28     ` Greg KH
  2020-09-25 15:34       ` Tetsuo Handa
  0 siblings, 1 reply; 19+ messages in thread
From: Greg KH @ 2020-09-25 15:28 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: Oliver Neukum, bjorn, linux-usb

On Sat, Sep 26, 2020 at 12:20:57AM +0900, Tetsuo Handa wrote:
> On 2020/09/26 0:11, Greg KH wrote:
> > On Tue, Sep 22, 2020 at 01:21:19PM +0200, Oliver Neukum wrote:
> >> Stress testing has shown that CDC-WDM has some issues with hangs
> >> and error reporting
> >>
> >> 1. wakeups are not correctly handled in multhreaded environments
> >> 2. unresponsive hardware is not handled
> >> 3. errors are not correctly reported. This needs flush() to be
> >> implemented.
> >>
> >> This version makes wdm_flush() use interruptible sleep.
> >>
> >> For easier review all squashed together:
> >>
> > 
> > I have like 3 or 4 different "RFC" series here from you for this driver,
> > which one is the "newest"?
> 
> https://lkml.kernel.org/r/20200923092136.14824-1-oneukum@suse.com
> 
> is the newest series from Oliver. But
> 
> https://lkml.kernel.org/r/b27841ab-a88c-13e2-a66f-6df7af1f46b4@i-love.sakura.ne.jp
> 
> is the squashed version with updated comments and deduplicated code.
> 
> > 
> > And can you send a series that isn't RFC so that I can know you feel it
> > is good enough to be merged?
> 
> Do you want this fix as a series of patches (the former link)?
> Since I think that the changeset should be atomically applied, I posted the latter link.

A single patch would be good to send to me again, burried at the end of
a long thread is hard to dig out.

Also with proper authorship is needed, did Oliver write this, or did
you?

There is the co-developed-by: tag, which looks like it might be relevant
here, can you do that?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] fixes for hangs and error reporting in CDC_WDM
  2020-09-25 15:28     ` Greg KH
@ 2020-09-25 15:34       ` Tetsuo Handa
  2020-09-26  5:40         ` Greg KH
  0 siblings, 1 reply; 19+ messages in thread
From: Tetsuo Handa @ 2020-09-25 15:34 UTC (permalink / raw)
  To: Greg KH, Oliver Neukum; +Cc: bjorn, linux-usb

On 2020/09/26 0:28, Greg KH wrote:
>> Do you want this fix as a series of patches (the former link)?
>> Since I think that the changeset should be atomically applied, I posted the latter link.
> 
> A single patch would be good to send to me again, burried at the end of
> a long thread is hard to dig out.
> 
> Also with proper authorship is needed, did Oliver write this, or did
> you?

Oliver wrote the majority part. I just squashed the series and updated comments
and deduplicated the code. Thus, I think main authorship should be given to Oliver.

> 
> There is the co-developed-by: tag, which looks like it might be relevant
> here, can you do that?

If you prefer the co-developed-by: tag, you can add:

Co-developed-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

Oliver, any comments?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] fixes for hangs and error reporting in CDC_WDM
  2020-09-25 15:34       ` Tetsuo Handa
@ 2020-09-26  5:40         ` Greg KH
  2020-09-29  8:46           ` Oliver Neukum
  0 siblings, 1 reply; 19+ messages in thread
From: Greg KH @ 2020-09-26  5:40 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: Oliver Neukum, bjorn, linux-usb

On Sat, Sep 26, 2020 at 12:34:24AM +0900, Tetsuo Handa wrote:
> On 2020/09/26 0:28, Greg KH wrote:
> >> Do you want this fix as a series of patches (the former link)?
> >> Since I think that the changeset should be atomically applied, I posted the latter link.
> > 
> > A single patch would be good to send to me again, burried at the end of
> > a long thread is hard to dig out.
> > 
> > Also with proper authorship is needed, did Oliver write this, or did
> > you?
> 
> Oliver wrote the majority part. I just squashed the series and updated comments
> and deduplicated the code. Thus, I think main authorship should be given to Oliver.
> 
> > 
> > There is the co-developed-by: tag, which looks like it might be relevant
> > here, can you do that?
> 
> If you prefer the co-developed-by: tag, you can add:
> 
> Co-developed-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

That seems reasonable, then put Oliver's address on the first line with
a "From:" line to make this all work correctly when I apply it.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] fixes for hangs and error reporting in CDC_WDM
  2020-09-26  5:40         ` Greg KH
@ 2020-09-29  8:46           ` Oliver Neukum
  2020-09-29 10:17             ` Tetsuo Handa
  0 siblings, 1 reply; 19+ messages in thread
From: Oliver Neukum @ 2020-09-29  8:46 UTC (permalink / raw)
  To: Greg KH, Tetsuo Handa; +Cc: bjorn, linux-usb

Am Samstag, den 26.09.2020, 07:40 +0200 schrieb Greg KH:
> On Sat, Sep 26, 2020 at 12:34:24AM +0900, Tetsuo Handa wrote:
> > On 2020/09/26 0:28, Greg KH wrote:
> > > > Do you want this fix as a series of patches (the former link)?
> > > > Since I think that the changeset should be atomically applied, I posted the latter link.
> > > 
> > > A single patch would be good to send to me again, burried at the end of
> > > a long thread is hard to dig out.
> > > 
> > > Also with proper authorship is needed, did Oliver write this, or did
> > > you?
> > 
> > Oliver wrote the majority part. I just squashed the series and updated comments
> > and deduplicated the code. Thus, I think main authorship should be given to Oliver.
> > 
> > > There is the co-developed-by: tag, which looks like it might be relevant
> > > here, can you do that?
> > 
> > If you prefer the co-developed-by: tag, you can add:
> > 
> > Co-developed-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> 
> That seems reasonable, then put Oliver's address on the first line with
> a "From:" line to make this all work correctly when I apply it.

Hi,

I did sent out the series with a PATCH label last week.
Should I resend?

	Regards
		Oliver



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] fixes for hangs and error reporting in CDC_WDM
  2020-09-29  8:46           ` Oliver Neukum
@ 2020-09-29 10:17             ` Tetsuo Handa
  0 siblings, 0 replies; 19+ messages in thread
From: Tetsuo Handa @ 2020-09-29 10:17 UTC (permalink / raw)
  To: Oliver Neukum, Greg KH; +Cc: bjorn, linux-usb

On 2020/09/29 17:46, Oliver Neukum wrote:
>> That seems reasonable, then put Oliver's address on the first line with
>> a "From:" line to make this all work correctly when I apply it.
> 
> Hi,
> 
> I did sent out the series with a PATCH label last week.
> Should I resend?
> 

No need to resend. I already sent a squashed version as
https://lkml.kernel.org/r/20200928141755.3476-1-penguin-kernel@I-love.SAKURA.ne.jp .

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC] fixes for hangs and error reporting in CDC_WDM
  2020-09-22 10:13 Oliver Neukum
@ 2020-09-22 10:42 ` Tetsuo Handa
  0 siblings, 0 replies; 19+ messages in thread
From: Tetsuo Handa @ 2020-09-22 10:42 UTC (permalink / raw)
  To: Oliver Neukum, bjorn, linux-usb

On 2020/09/22 19:13, Oliver Neukum wrote:
> +/* flush() is uninterruptible, but we cannot wait forever */
> +#define WDM_FLUSH_TIMEOUT      (30 * HZ)

>  static int wdm_flush(struct file *file, fl_owner_t id)
>  {
>         struct wdm_device *desc = file->private_data;
> +       int rv;
>  
> -       wait_event(desc->wait,
> +       rv = wait_event_timeout(desc->wait,
>                         /*
>                          * needs both flags. We cannot do with one
>                          * because resetting it would cause a race
> @@ -595,16 +630,25 @@ static int wdm_flush(struct file *file, fl_owner_t id)
>                          * a disconnect
>                          */
>                         !test_bit(WDM_IN_USE, &desc->flags) ||
> -                       test_bit(WDM_DISCONNECTING, &desc->flags));
> +                       test_bit(WDM_DISCONNECTING, &desc->flags),
> +                       WDM_FLUSH_TIMEOUT);
>  

Generally looks OK.

Any chance we can use wait_event_killable_timeout() or wait_event_killable() here?
syzkaller sends SIGKILL after 5 seconds from process creation. Blocking for 30
seconds in TASK_UNINTERRUPTIBLE state is not happy when killed by e.g. OOM killer.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [RFC] fixes for hangs and error reporting in CDC_WDM
@ 2020-09-22 10:13 Oliver Neukum
  2020-09-22 10:42 ` Tetsuo Handa
  0 siblings, 1 reply; 19+ messages in thread
From: Oliver Neukum @ 2020-09-22 10:13 UTC (permalink / raw)
  To: penguin-kernel, bjorn, linux-usb

Stress testing has shown that CDC-WDM has some issues with hangs
and error reporting

1. wakeups are not correctly handled in multhreaded environments
2. unresponsive hardware is not handled
3. errors are not correctly reported. This needs flush() to be
implemented.

For easier review all squashed together:

--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -58,6 +58,9 @@ MODULE_DEVICE_TABLE (usb, wdm_ids);
 
 #define WDM_MAX                        16
 
+/* flush() is uninterruptible, but we cannot wait forever */
+#define WDM_FLUSH_TIMEOUT      (30 * HZ)
+
 /* CDC-WMC r1.1 requires wMaxCommand to be "at least 256 decimal (0x100)" */
 #define WDM_DEFAULT_BUFSIZE    256
 
@@ -151,7 +154,7 @@ static void wdm_out_callback(struct urb *urb)
        kfree(desc->outbuf);
        desc->outbuf = NULL;
        clear_bit(WDM_IN_USE, &desc->flags);
-       wake_up(&desc->wait);
+       wake_up_all(&desc->wait);
 }
 
 static void wdm_in_callback(struct urb *urb)
@@ -393,6 +396,9 @@ static ssize_t wdm_write
        if (test_bit(WDM_RESETTING, &desc->flags))
                r = -EIO;
 
+       if (test_bit(WDM_DISCONNECTING, &desc->flags))
+               r = -ENODEV;
+
        if (r < 0) {
                rv = r;
                goto out_free_mem_pm;
@@ -424,6 +430,7 @@ static ssize_t wdm_write
        if (rv < 0) {
                desc->outbuf = NULL;
                clear_bit(WDM_IN_USE, &desc->flags);
+               wake_up_all(&desc->wait); /* for flush() */
                dev_err(&desc->intf->dev, "Tx URB error: %d\n", rv);
                rv = usb_translate_errors(rv);
                goto out_free_mem_pm;
@@ -583,11 +590,39 @@ static ssize_t wdm_read
        return rv;
 }
 
+/*
+ * The difference to flush is that we wait forever. If you don't like
+ * that behavior, you need to send a signal.
+ */
+
+static int wdm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+       struct wdm_device *desc = file->private_data;
+       int rv;
+
+       rv = wait_event_interruptible(desc->wait,
+                       !test_bit(WDM_IN_USE, &desc->flags) ||
+                       test_bit(WDM_DISCONNECTING, &desc->flags));
+
+       if (test_bit(WDM_DISCONNECTING, &desc->flags))
+               return -ENODEV;
+       if (rv < 0)
+               return -EINTR;
+
+       spin_lock_irq(&desc->iuspin);
+       rv = desc->werr;
+       desc->werr = 0;
+       spin_unlock_irq(&desc->iuspin);
+
+       return usb_translate_errors(rv);
+}
+
 static int wdm_flush(struct file *file, fl_owner_t id)
 {
        struct wdm_device *desc = file->private_data;
+       int rv;
 
-       wait_event(desc->wait,
+       rv = wait_event_timeout(desc->wait,
                        /*
                         * needs both flags. We cannot do with one
                         * because resetting it would cause a race
@@ -595,16 +630,25 @@ static int wdm_flush(struct file *file, fl_owner_t id)
                         * a disconnect
                         */
                        !test_bit(WDM_IN_USE, &desc->flags) ||
-                       test_bit(WDM_DISCONNECTING, &desc->flags));
+                       test_bit(WDM_DISCONNECTING, &desc->flags),
+                       WDM_FLUSH_TIMEOUT);
 
-       /* cannot dereference desc->intf if WDM_DISCONNECTING */
+       /*
+        * to report the correct error.
+        * This is best effort
+        * We are inevitably racing with the hardware.
+        */
        if (test_bit(WDM_DISCONNECTING, &desc->flags))
                return -ENODEV;
-       if (desc->werr < 0)
-               dev_err(&desc->intf->dev, "Error in flush path: %d\n",
-                       desc->werr);
+       if (!rv)
+               return -EIO;
+
+       spin_lock_irq(&desc->iuspin);
+       rv = desc->werr;
+       desc->werr = 0;
+       spin_unlock_irq(&desc->iuspin);
 
-       return usb_translate_errors(desc->werr);
+       return usb_translate_errors(rv);
 }
 
 static __poll_t wdm_poll(struct file *file, struct poll_table_struct *wait)
@@ -729,6 +773,7 @@ static const struct file_operations wdm_fops = {
        .owner =        THIS_MODULE,
        .read =         wdm_read,
        .write =        wdm_write,
+       .fsync =        wdm_fsync,
        .open =         wdm_open,
        .flush =        wdm_flush,


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2020-09-29 10:18 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-22 11:21 [RFC] fixes for hangs and error reporting in CDC_WDM Oliver Neukum
2020-09-22 11:21 ` [RFC 1/7] CDC-WDM: fix hangs in flush() in multithreaded cases Oliver Neukum
2020-09-22 11:21 ` [RFC 2/7] CDC-WDM: introduce a timeout in flush() Oliver Neukum
2020-09-22 11:21 ` [RFC 3/7] CDC-WDM: remove use of intf->dev after potential disconnect Oliver Neukum
2020-09-22 11:21 ` [RFC 4/7] CDC-WDM: fix race reporting errors in flush Oliver Neukum
2020-09-22 11:21 ` [RFC 5/7] CDC-WDM: correct error reporting in write() Oliver Neukum
2020-09-22 11:21 ` [RFC 6/7] CDC-WDM: implement fsync Oliver Neukum
2020-09-22 11:21 ` [RFC 7/7] CDC-WDM: making flush() interruptible Oliver Neukum
2020-09-22 11:38   ` Tetsuo Handa
2020-09-22 11:49     ` Oliver Neukum
2020-09-25 15:11 ` [RFC] fixes for hangs and error reporting in CDC_WDM Greg KH
2020-09-25 15:20   ` Tetsuo Handa
2020-09-25 15:28     ` Greg KH
2020-09-25 15:34       ` Tetsuo Handa
2020-09-26  5:40         ` Greg KH
2020-09-29  8:46           ` Oliver Neukum
2020-09-29 10:17             ` Tetsuo Handa
  -- strict thread matches above, loose matches on Subject: below --
2020-09-22 10:13 Oliver Neukum
2020-09-22 10:42 ` Tetsuo Handa

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.