* [LTP] [PATCH 1/2] read_all: Add worker timeout
@ 2022-07-12 12:46 Richard Palethorpe via ltp
  2022-07-12 12:46 ` [LTP] [PATCH 2/2] read_all: Fix type warnings Richard Palethorpe via ltp
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Richard Palethorpe via ltp @ 2022-07-12 12:46 UTC (permalink / raw)
  To: ltp; +Cc: Richard Palethorpe

Kill and restart workers that take too long to read a file. The
default being one second. A custom time can be set with the new -t
option.

This is to prevent a worker from blocking forever in a read. Currently
when this happens the whole test times out and any remaining files in
the worker's queue are not tested.

As a side effect we can now also set the timeout very low to cause
partial reads.
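
For example (an illustrative invocation, not a prescribed runtest entry),
a 100ms timeout could be requested with:

    read_all -d /sys -t 100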

Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
Cc: Joerg Vehlow <lkml@jv-coder.de>
Cc: Li Wang <liwang@redhat.com>
---
 testcases/kernel/fs/read_all/read_all.c | 83 ++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/testcases/kernel/fs/read_all/read_all.c b/testcases/kernel/fs/read_all/read_all.c
index a5b93b966..d8c68bd33 100644
--- a/testcases/kernel/fs/read_all/read_all.c
+++ b/testcases/kernel/fs/read_all/read_all.c
@@ -26,8 +26,10 @@
  * an IPC stress test on systems with large numbers of weak cores. This can be
  * overridden with the 'w' parameters.
  */
+#include <signal.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/wait.h>
 #include <lapi/fnmatch.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -43,7 +45,9 @@
 #include <pwd.h>
 #include <grp.h>
 
+#include "tst_clocks.h"
 #include "tst_test.h"
+#include "tst_timer.h"
 
 #define QUEUE_SIZE 16384
 #define BUFFER_SIZE 1024
@@ -60,6 +64,7 @@ struct queue {
 struct worker {
 	pid_t pid;
 	struct queue *q;
+	int last_seen;
 };
 
 enum dent_action {
@@ -80,6 +85,8 @@ static char *str_max_workers;
 static long max_workers = 15;
 static struct worker *workers;
 static char *drop_privs;
+static char *str_worker_timeout;
+static int worker_timeout = 1000;
 
 static char *blacklist[] = {
 	NULL, /* reserved for -e parameter */
@@ -227,10 +234,14 @@ static int worker_run(struct worker *self)
 		.sa_flags = 0,
 	};
 	struct queue *q = self->q;
+	struct timespec now;
 
 	sigaction(SIGTTIN, &term_sa, NULL);
 
 	while (1) {
+		tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+		tst_atomic_store(tst_timespec_to_ms(now), &self->last_seen);
+
 		if (!queue_pop(q, buf))
 			break;
 
@@ -270,11 +281,15 @@ static void spawn_workers(void)
 {
 	int i;
 	struct worker *wa = workers;
+	struct timespec now;
+
+	tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
 
 	memset(workers, 0, worker_count * sizeof(*workers));
 
 	for (i = 0; i < worker_count; i++) {
 		wa[i].q = queue_init();
+		wa[i].last_seen = tst_timespec_to_ms(now);
 		wa[i].pid = SAFE_FORK();
 		if (!wa[i].pid) {
 			maybe_drop_privs();
@@ -283,9 +298,52 @@ static void spawn_workers(void)
 	}
 }
 
+static void restart_worker(struct worker *const worker)
+{
+	int wstatus, ret, i, q_len;
+	struct timespec now;
+
+	kill(worker->pid, SIGKILL);
+	ret = waitpid(worker->pid, &wstatus, 0);
+
+	if (ret != worker->pid) {
+		tst_brk(TBROK | TERRNO, "waitpid(%d, ...) = %d",
+			worker->pid, ret);
+	}
+
+	/* Make sure the queue length and semaphore match. Threre is a
+	 * race in qeuue_pop where the semaphore can be decremented
+	 * then the worker killed before updating q->front
+	 */
+	q_len = 0;
+	i = worker->q->front;
+	while (i != worker->q->back) {
+		if (!worker->q->data[i])
+			q_len++;
+
+		i = (i + 1) % QUEUE_SIZE;
+	}
+
+	ret = sem_destroy(&worker->q->sem);
+	if (ret == -1)
+		tst_brk(TBROK | TERRNO, "sem_destroy");
+	ret = sem_init(&worker->q->sem, 1, q_len);
+	if (ret == -1)
+		tst_brk(TBROK | TERRNO, "sem_init");
+
+	tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+	worker->last_seen = tst_timespec_to_ms(now);
+	worker->pid = SAFE_FORK();
+
+	if (!worker->pid)
+		exit(worker_run(worker));
+}
+
 static void work_push_retry(int worker, const char *buf)
 {
 	int i, ret, worker_min, worker_max, usleep_time = 100;
+	struct timespec now;
+	int elapsed;
 
 	if (worker < 0) {
 		/* pick any, try -worker first */
@@ -299,10 +357,25 @@ static void work_push_retry(int worker, const char *buf)
 	i = worker_min;
 
 	for (;;) {
-		ret = queue_push(workers[i].q, buf);
+		struct worker *const w = workers + i;
+
+		ret = queue_push(w->q, buf);
 		if (ret == 1)
 			break;
 
+		tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+		elapsed =
+			tst_timespec_to_ms(now) - tst_atomic_load(&w->last_seen);
+
+		if (elapsed > worker_timeout) {
+			if (!quiet) {
+				tst_res(TINFO,
+					"Worker %d (%d) stuck for %dms, restarting it",
+					i, w->pid, elapsed);
+			}
+			restart_worker(w);
+		}
+
 		if (++i >= worker_max) {
 			i = worker_min;
 			if (usleep_time < 100000)
@@ -368,6 +441,12 @@ static void setup(void)
 	if (!worker_count)
 		worker_count = MIN(MAX(tst_ncpus() - 1, 1), max_workers);
 	workers = SAFE_MALLOC(worker_count * sizeof(*workers));
+
+	if (tst_parse_int(str_worker_timeout, &worker_timeout, 1, INT_MAX)) {
+		tst_brk(TBROK,
+			"Invalid worker timeout (-t) argument: '%s'",
+			str_worker_timeout);
+	}
 }
 
 static void cleanup(void)
@@ -465,6 +544,8 @@ static struct tst_test test = {
 		 "Count Override the worker count. Ignores (-w) and the processor count."},
 		{"p", &drop_privs,
 		 "Drop privileges; switch to the nobody user."},
+		{"t:", &str_worker_timeout,
+		 "Milliseconds a worker has to read a file before it is restarted"},
 		{}
 	},
 	.setup = setup,
-- 
2.36.1



* [LTP] [PATCH 2/2] read_all: Fix type warnings
  2022-07-12 12:46 [LTP] [PATCH 1/2] read_all: Add worker timeout Richard Palethorpe via ltp
@ 2022-07-12 12:46 ` Richard Palethorpe via ltp
  2022-07-14  4:46   ` Li Wang
  2022-07-14 17:53   ` Petr Vorel
  2022-07-14  4:45 ` [LTP] [PATCH 1/2] read_all: Add worker timeout Li Wang
  2022-07-14  6:58 ` Jan Stancek
  2 siblings, 2 replies; 12+ messages in thread
From: Richard Palethorpe via ltp @ 2022-07-12 12:46 UTC (permalink / raw)
  To: ltp; +Cc: Richard Palethorpe

Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
---
 testcases/kernel/fs/read_all/read_all.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/testcases/kernel/fs/read_all/read_all.c b/testcases/kernel/fs/read_all/read_all.c
index d8c68bd33..813991f2c 100644
--- a/testcases/kernel/fs/read_all/read_all.c
+++ b/testcases/kernel/fs/read_all/read_all.c
@@ -168,7 +168,7 @@ static void sanitize_str(char *buf, ssize_t count)
 {
 	int i;
 
-	for (i = 0; i < MIN(count, MAX_DISPLAY); i++)
+	for (i = 0; i < MIN(count, (ssize_t)MAX_DISPLAY); i++)
 		if (!isprint(buf[i]))
 			buf[i] = ' ';
 
@@ -439,7 +439,7 @@ static void setup(void)
 		tst_brk(TBROK, "The directory argument (-d) is required");
 
 	if (!worker_count)
-		worker_count = MIN(MAX(tst_ncpus() - 1, 1), max_workers);
+		worker_count = MIN(MAX(tst_ncpus() - 1, 1L), max_workers);
 	workers = SAFE_MALLOC(worker_count * sizeof(*workers));
 
 	if (tst_parse_int(str_worker_timeout, &worker_timeout, 1, INT_MAX)) {
-- 
2.36.1



* Re: [LTP] [PATCH 1/2] read_all: Add worker timeout
  2022-07-12 12:46 [LTP] [PATCH 1/2] read_all: Add worker timeout Richard Palethorpe via ltp
  2022-07-12 12:46 ` [LTP] [PATCH 2/2] read_all: Fix type warnings Richard Palethorpe via ltp
@ 2022-07-14  4:45 ` Li Wang
  2022-07-18 10:09   ` Richard Palethorpe
  2022-07-14  6:58 ` Jan Stancek
  2 siblings, 1 reply; 12+ messages in thread
From: Li Wang @ 2022-07-14  4:45 UTC (permalink / raw)
  To: Richard Palethorpe; +Cc: LTP List


Hi Richard,

On Tue, Jul 12, 2022 at 8:46 PM Richard Palethorpe <rpalethorpe@suse.com>
wrote:

> Kill and restart workers that take too long to read a file. The
> default being one second. A custom time can be set with the new -t
> option.
>
> This is to prevent a worker from blocking forever in a read. Currently
> when this happens the whole test times out and any remaining files in
> the worker's queue are not tested.
>
> As a side effect we can now also set the timeout very low to cause
> partial reads.
>

Restarting workers that are blocked reading files is a practical approach.
My only concern is that a restarted worker may also be slow reading some
specific files, so it will time out again and the rest will fall into
restart cycles. Here I'd suggest loosening the worker_timeout to 3000 ms
to accommodate systems with different IO speeds.

Though each worker takes at most 3 seconds, the max time would not
exceed 45 s (3 s * max_workers), unless the files are tested sequentially
in the worst case. So bumping up max_runtime may also be helpful.

Anyway, I'd be happy to apply your patch first and see how things go.
Reviewed-by: Li Wang <liwang@redhat.com>

--- a/testcases/kernel/fs/read_all/read_all.c
+++ b/testcases/kernel/fs/read_all/read_all.c
@@ -64,7 +64,7 @@ struct queue {
 struct worker {
        pid_t pid;
        struct queue *q;
-       int last_seen;
+       unsigned long long last_seen;
 };

 enum dent_action {
@@ -86,7 +86,7 @@ static long max_workers = 15;
 static struct worker *workers;
 static char *drop_privs;
 static char *str_worker_timeout;
-static int worker_timeout = 1000;
+static int worker_timeout = 3000;

 static char *blacklist[] = {
        NULL, /* reserved for -e parameter */
@@ -552,5 +552,6 @@ static struct tst_test test = {
        .cleanup = cleanup,
        .test_all = run,
        .forks_child = 1,
+       .max_runtime = 60,
 };


-- 
Regards,
Li Wang


* Re: [LTP] [PATCH 2/2] read_all: Fix type warnings
  2022-07-12 12:46 ` [LTP] [PATCH 2/2] read_all: Fix type warnings Richard Palethorpe via ltp
@ 2022-07-14  4:46   ` Li Wang
  2022-07-14 17:53   ` Petr Vorel
  1 sibling, 0 replies; 12+ messages in thread
From: Li Wang @ 2022-07-14  4:46 UTC (permalink / raw)
  To: Richard Palethorpe; +Cc: LTP List


On Tue, Jul 12, 2022 at 8:46 PM Richard Palethorpe via ltp <ltp@lists.linux.it> wrote:

> Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
>

Reviewed-by: Li Wang <liwang@redhat.com>


> ---
>  testcases/kernel/fs/read_all/read_all.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/testcases/kernel/fs/read_all/read_all.c b/testcases/kernel/fs/read_all/read_all.c
> index d8c68bd33..813991f2c 100644
> --- a/testcases/kernel/fs/read_all/read_all.c
> +++ b/testcases/kernel/fs/read_all/read_all.c
> @@ -168,7 +168,7 @@ static void sanitize_str(char *buf, ssize_t count)
>  {
>         int i;
>
> -       for (i = 0; i < MIN(count, MAX_DISPLAY); i++)
> +       for (i = 0; i < MIN(count, (ssize_t)MAX_DISPLAY); i++)
>                 if (!isprint(buf[i]))
>                         buf[i] = ' ';
>
> @@ -439,7 +439,7 @@ static void setup(void)
>                 tst_brk(TBROK, "The directory argument (-d) is required");
>
>         if (!worker_count)
> -               worker_count = MIN(MAX(tst_ncpus() - 1, 1), max_workers);
> +               worker_count = MIN(MAX(tst_ncpus() - 1, 1L), max_workers);
>         workers = SAFE_MALLOC(worker_count * sizeof(*workers));
>
>         if (tst_parse_int(str_worker_timeout, &worker_timeout, 1, INT_MAX)) {
> --
> 2.36.1
>
>
> --
> Mailing list info: https://lists.linux.it/listinfo/ltp
>
>

-- 
Regards,
Li Wang


* Re: [LTP] [PATCH 1/2] read_all: Add worker timeout
  2022-07-12 12:46 [LTP] [PATCH 1/2] read_all: Add worker timeout Richard Palethorpe via ltp
  2022-07-12 12:46 ` [LTP] [PATCH 2/2] read_all: Fix type warnings Richard Palethorpe via ltp
  2022-07-14  4:45 ` [LTP] [PATCH 1/2] read_all: Add worker timeout Li Wang
@ 2022-07-14  6:58 ` Jan Stancek
  2022-07-14 17:56   ` Petr Vorel
  2022-07-18 10:57   ` Richard Palethorpe
  2 siblings, 2 replies; 12+ messages in thread
From: Jan Stancek @ 2022-07-14  6:58 UTC (permalink / raw)
  To: Richard Palethorpe; +Cc: LTP List

On Tue, Jul 12, 2022 at 2:46 PM Richard Palethorpe via ltp
<ltp@lists.linux.it> wrote:
>
> Kill and restart workers that take too long to read a file. The
> default being one second. A custom time can be set with the new -t
> option.
>
> This is to prevent a worker from blocking forever in a read. Currently
> when this happens the whole test times out and any remaining files in
> the worker's queue are not tested.
>
> As a side effect we can now also set the timeout very low to cause
> partial reads.
>
> Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
> Cc: Joerg Vehlow <lkml@jv-coder.de>
> Cc: Li Wang <liwang@redhat.com>
> ---
>  testcases/kernel/fs/read_all/read_all.c | 83 ++++++++++++++++++++++++-
>  1 file changed, 82 insertions(+), 1 deletion(-)

>
> +static void restart_worker(struct worker *const worker)
> +{
> +       int wstatus, ret, i, q_len;
> +       struct timespec now;
> +
> +       kill(worker->pid, SIGKILL);
> +       ret = waitpid(worker->pid, &wstatus, 0);

Is there a chance we could get stuck in an uninterruptible read? I think I saw
some in the past, but those may be blacklisted already, so this may only be
something to watch for if we still get test timeouts in the future.

> +
> +       if (ret != worker->pid) {
> +               tst_brk(TBROK | TERRNO, "waitpid(%d, ...) = %d",
> +                       worker->pid, ret);
> +       }
> +
> +       /* Make sure the queue length and semaphore match. Threre is a
> +        * race in qeuue_pop where the semaphore can be decremented
^^ typo in queue_pop above

> +        * then the worker killed before updating q->front
> +        */
> +       q_len = 0;
> +       i = worker->q->front;
> +       while (i != worker->q->back) {
> +               if (!worker->q->data[i])
> +                       q_len++;
> +
> +               i = (i + 1) % QUEUE_SIZE;
> +       }
> +
> +       ret = sem_destroy(&worker->q->sem);
> +       if (ret == -1)
> +               tst_brk(TBROK | TERRNO, "sem_destroy");
> +       ret = sem_init(&worker->q->sem, 1, q_len);
> +       if (ret == -1)
> +               tst_brk(TBROK | TERRNO, "sem_init");
> +
> +       tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
> +       worker->last_seen = tst_timespec_to_ms(now);
> +       worker->pid = SAFE_FORK();
> +
> +       if (!worker->pid)
> +               exit(worker_run(worker));
> +}
> +
>  static void work_push_retry(int worker, const char *buf)
>  {
>         int i, ret, worker_min, worker_max, usleep_time = 100;
> +       struct timespec now;
> +       int elapsed;
>
>         if (worker < 0) {
>                 /* pick any, try -worker first */
> @@ -299,10 +357,25 @@ static void work_push_retry(int worker, const char *buf)
>         i = worker_min;
>
>         for (;;) {
> -               ret = queue_push(workers[i].q, buf);
> +               struct worker *const w = workers + i;
> +
> +               ret = queue_push(w->q, buf);
>                 if (ret == 1)
>                         break;
>
> +               tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
> +               elapsed =
> +                       tst_timespec_to_ms(now) - tst_atomic_load(&w->last_seen);
> +
> +               if (elapsed > worker_timeout) {
> +                       if (!quiet) {
> +                               tst_res(TINFO,
> +                                       "Worker %d (%d) stuck for %dms, restarting it",
> +                                       i, w->pid, elapsed);

Can we also print the file the worker is stuck on?
And as Li pointed out, I'd also extend max_runtime to 60 seconds.

Acked-by: Jan Stancek <jstancek@redhat.com>



* Re: [LTP] [PATCH 2/2] read_all: Fix type warnings
  2022-07-12 12:46 ` [LTP] [PATCH 2/2] read_all: Fix type warnings Richard Palethorpe via ltp
  2022-07-14  4:46   ` Li Wang
@ 2022-07-14 17:53   ` Petr Vorel
  1 sibling, 0 replies; 12+ messages in thread
From: Petr Vorel @ 2022-07-14 17:53 UTC (permalink / raw)
  To: Richard Palethorpe; +Cc: ltp

Hi Richie,

Obviously correct, thanks!

Reviewed-by: Petr Vorel <pvorel@suse.cz>

Kind regards,
Petr


* Re: [LTP] [PATCH 1/2] read_all: Add worker timeout
  2022-07-14  6:58 ` Jan Stancek
@ 2022-07-14 17:56   ` Petr Vorel
  2022-07-18 10:37     ` Richard Palethorpe
  2022-07-18 10:57   ` Richard Palethorpe
  1 sibling, 1 reply; 12+ messages in thread
From: Petr Vorel @ 2022-07-14 17:56 UTC (permalink / raw)
  To: Jan Stancek; +Cc: Richard Palethorpe, LTP List

Hi all,

Reviewed-by: Petr Vorel <pvorel@suse.cz>

> > +static void restart_worker(struct worker *const worker)
> > +{
> > +       int wstatus, ret, i, q_len;
> > +       struct timespec now;
> > +
> > +       kill(worker->pid, SIGKILL);
> > +       ret = waitpid(worker->pid, &wstatus, 0);

> Is there a chance we could get stuck in an uninterruptible read? I think I saw
> some in the past, but those may be blacklisted already, so this may only be
> something to watch for if we still get test timeouts in the future.

+1

...
> > +       if (ret != worker->pid) {
> > +               tst_brk(TBROK | TERRNO, "waitpid(%d, ...) = %d",
> > +                       worker->pid, ret);
> > +       }
> > +
> > +       /* Make sure the queue length and semaphore match. Threre is a
> > +        * race in qeuue_pop where the semaphore can be decremented
> ^^ typo in queue_pop above

...
> > +               tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
> > +               elapsed =
> > +                       tst_timespec_to_ms(now) - tst_atomic_load(&w->last_seen);
> > +
> > +               if (elapsed > worker_timeout) {
> > +                       if (!quiet) {
> > +                               tst_res(TINFO,
> > +                                       "Worker %d (%d) stuck for %dms, restarting it",
> > +                                       i, w->pid, elapsed);

> Can we also print the file the worker is stuck on?
> And as Li pointed out, I'd also extend max_runtime to 60 seconds.

+1, all these additional changes make sense to me.

Kind regards,
Petr


* Re: [LTP] [PATCH 1/2] read_all: Add worker timeout
  2022-07-14  4:45 ` [LTP] [PATCH 1/2] read_all: Add worker timeout Li Wang
@ 2022-07-18 10:09   ` Richard Palethorpe
  2022-07-19  8:58     ` Li Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Richard Palethorpe @ 2022-07-18 10:09 UTC (permalink / raw)
  To: Li Wang; +Cc: LTP List

Hello Li,

Li Wang <liwang@redhat.com> writes:

> Hi Richard,
>
> On Tue, Jul 12, 2022 at 8:46 PM Richard Palethorpe <rpalethorpe@suse.com> wrote:
>
>  Kill and restart workers that take too long to read a file. The
>  default being one second. A custom time can be set with the new -t
>  option.
>
>  This is to prevent a worker from blocking forever in a read. Currently
>  when this happens the whole test times out and any remaining files in
>  the worker's queue are not tested.
>
>  As a side effect we can now also set the timeout very low to cause
>  partial reads.
>
> Restarting workers that are blocked reading files is a practical approach.
> My only concern is that a restarted worker may also be slow reading some
> specific files, so it will time out again and the rest will fall into
> restart cycles. Here I'd suggest loosening the worker_timeout to 3000 ms
> to accommodate systems with different IO speeds.

I didn't observe any issues setting the timeout very low
(<100ms). Workers remove an item from the queue before trying to read
it, so they shouldn't get stuck in a restart cycle on the same file (if
that's what you meant).

>
> Though each worker takes at most 3 seconds, the max time would not
> exceed 45 s (3 s * max_workers), unless the files are tested sequentially
> in the worst case. So bumping up max_runtime may also be helpful.
>
> Anyway, I'd be happy to apply your patch first and see how things go.
> Reviewed-by: Li Wang <liwang@redhat.com>
>
> --- a/testcases/kernel/fs/read_all/read_all.c
> +++ b/testcases/kernel/fs/read_all/read_all.c
> @@ -64,7 +64,7 @@ struct queue {
>  struct worker {
>         pid_t pid;
>         struct queue *q;
> -       int last_seen;
> +       unsigned long long last_seen;
>  };
>  
>  enum dent_action {
> @@ -86,7 +86,7 @@ static long max_workers = 15;
>  static struct worker *workers;
>  static char *drop_privs;
>  static char *str_worker_timeout;
> -static int worker_timeout = 1000;
> +static int worker_timeout = 3000;
>  
>  static char *blacklist[] = {
>         NULL, /* reserved for -e parameter */
> @@ -552,5 +552,6 @@ static struct tst_test test = {
>         .cleanup = cleanup,
>         .test_all = run,
>         .forks_child = 1,
> +       .max_runtime = 60,
>  };

I'm not sure if this is better or worse. In my /sys folder there are
approx 35K files. Obviously a timeout of even 1s is far too long to read
all of them if they all time out. In fact if we set the timeout as 100ms
then they would still require about 58 mins.
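(That is, 35,000 files * 100 ms = 3,500 s, or roughly 58 minutes,
assuming every read hits the timeout and the reads are effectively
serialized.)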

Of course most of them take a very short time on most systems. I guess
that ideally the timeout needs to be adapted as the test runs so that
all remaining files can be read. The issue with that though is that kill+wait+fork
takes more time than most reads. Although that can be measured as
well....

This issue is quite a lot like the issues we have with fuzzy sync. I
think it's maybe time to start dynamically adapting to system
performance. That's probably best left to another patch series though.
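
A minimal sketch of what such adaptation might look like, assuming the
parent records how long each read took (every name and constant below is
hypothetical, none of this is in read_all.c today):

	/* Hypothetical sketch: derive the timeout from an exponential
	 * moving average of observed read times instead of a fixed value.
	 */
	static int read_ms_avg = 10;

	static void record_read_time(int elapsed_ms)
	{
		/* EMA with alpha = 1/8, in integer arithmetic */
		read_ms_avg += (elapsed_ms - read_ms_avg) / 8;
	}

	static int adaptive_timeout(void)
	{
		/* Allow an order of magnitude above a typical read, but
		 * never more than the configured worker_timeout.
		 */
		return MAX(100, MIN(10 * read_ms_avg, worker_timeout));
	}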

-- 
Thank you,
Richard.


* Re: [LTP] [PATCH 1/2] read_all: Add worker timeout
  2022-07-14 17:56   ` Petr Vorel
@ 2022-07-18 10:37     ` Richard Palethorpe
  0 siblings, 0 replies; 12+ messages in thread
From: Richard Palethorpe @ 2022-07-18 10:37 UTC (permalink / raw)
  To: Petr Vorel; +Cc: LTP List

Hello,

Petr Vorel <pvorel@suse.cz> writes:

> Hi all,
>
> Reviewed-by: Petr Vorel <pvorel@suse.cz>
>
>> > +static void restart_worker(struct worker *const worker)
>> > +{
>> > +       int wstatus, ret, i, q_len;
>> > +       struct timespec now;
>> > +
>> > +       kill(worker->pid, SIGKILL);
>> > +       ret = waitpid(worker->pid, &wstatus, 0);
>
>> Is there a chance we could get stuck in an uninterruptible read? I think I saw
>> some in the past, but those may be blacklisted already, so this may only be
>> something to watch for if we still get test timeouts in the future.
>
> +1
>
> ...
>> > +       if (ret != worker->pid) {
>> > +               tst_brk(TBROK | TERRNO, "waitpid(%d, ...) = %d",
>> > +                       worker->pid, ret);
>> > +       }
>> > +
>> > +       /* Make sure the queue length and semaphore match. Threre is a
>> > +        * race in qeuue_pop where the semaphore can be decremented
>> ^^ typo in queue_pop above
>
> ...
>> > +               tst_clock_gettime(CLOCK_MONOTONIC_RAW, &now);
>> > +               elapsed =
>> > +                       tst_timespec_to_ms(now) - tst_atomic_load(&w->last_seen);
>> > +
>> > +               if (elapsed > worker_timeout) {
>> > +                       if (!quiet) {
>> > +                               tst_res(TINFO,
>> > +                                       "Worker %d (%d) stuck for %dms, restarting it",
>> > +                                       i, w->pid, elapsed);
>
>> Can we also print the file the worker is stuck on?
>> And as Li pointed out, I'd also extend max_runtime to 60 seconds.
>
> +1, all these additional changes make sense to me.

OK I'll make these changes, thanks!

>
> Kind regards,
> Petr


-- 
Thank you,
Richard.


* Re: [LTP] [PATCH 1/2] read_all: Add worker timeout
  2022-07-14  6:58 ` Jan Stancek
  2022-07-14 17:56   ` Petr Vorel
@ 2022-07-18 10:57   ` Richard Palethorpe
  2022-07-18 13:01     ` Richard Palethorpe
  1 sibling, 1 reply; 12+ messages in thread
From: Richard Palethorpe @ 2022-07-18 10:57 UTC (permalink / raw)
  To: Jan Stancek; +Cc: LTP List

Hello,

Jan Stancek <jstancek@redhat.com> writes:

> On Tue, Jul 12, 2022 at 2:46 PM Richard Palethorpe via ltp
> <ltp@lists.linux.it> wrote:
>>
>> Kill and restart workers that take too long to read a file. The
>> default being one second. A custom time can be set with the new -t
>> option.
>>
>> This is to prevent a worker from blocking forever in a read. Currently
>> when this happens the whole test times out and any remaining files in
>> the worker's queue are not tested.
>>
>> As a side effect we can now also set the timeout very low to cause
>> partial reads.
>>
>> Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
>> Cc: Joerg Vehlow <lkml@jv-coder.de>
>> Cc: Li Wang <liwang@redhat.com>
>> ---
>>  testcases/kernel/fs/read_all/read_all.c | 83 ++++++++++++++++++++++++-
>>  1 file changed, 82 insertions(+), 1 deletion(-)
>
>>
>> +static void restart_worker(struct worker *const worker)
>> +{
>> +       int wstatus, ret, i, q_len;
>> +       struct timespec now;
>> +
>> +       kill(worker->pid, SIGKILL);
>> +       ret = waitpid(worker->pid, &wstatus, 0);
>
> Is there a chance we could get stuck in an uninterruptible read? I think I saw
> some in the past, but those may be blacklisted already, so this may only be
> something to watch for if we still get test timeouts in the future.
>

I was hoping that kill is special somehow, but I suppose that I should
check exactly what happens. If the process is stuck inside the kernel
then we don't want to wait too long for it. We just need to know that
the kill signal was delivered and that the process will not return to
userland. If we have a large number of zombies then it could exhaust the
PIDs or some other resource, but most reads are done very quickly and
don't need interrupting.

-- 
Thank you,
Richard.


* Re: [LTP] [PATCH 1/2] read_all: Add worker timeout
  2022-07-18 10:57   ` Richard Palethorpe
@ 2022-07-18 13:01     ` Richard Palethorpe
  0 siblings, 0 replies; 12+ messages in thread
From: Richard Palethorpe @ 2022-07-18 13:01 UTC (permalink / raw)
  To: Jan Stancek; +Cc: LTP List

Hello,

Richard Palethorpe <rpalethorpe@suse.de> writes:

> Hello,
>
> Jan Stancek <jstancek@redhat.com> writes:
>
>> On Tue, Jul 12, 2022 at 2:46 PM Richard Palethorpe via ltp
>> <ltp@lists.linux.it> wrote:
>>>
>>> Kill and restart workers that take too long to read a file. The
>>> default being one second. A custom time can be set with the new -t
>>> option.
>>>
>>> This is to prevent a worker from blocking forever in a read. Currently
>>> when this happens the whole test times out and any remaining files in
>>> the worker's queue are not tested.
>>>
>>> As a side effect we can now also set the timeout very low to cause
>>> partial reads.
>>>
>>> Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
>>> Cc: Joerg Vehlow <lkml@jv-coder.de>
>>> Cc: Li Wang <liwang@redhat.com>
>>> ---
>>>  testcases/kernel/fs/read_all/read_all.c | 83 ++++++++++++++++++++++++-
>>>  1 file changed, 82 insertions(+), 1 deletion(-)
>>
>>>
>>> +static void restart_worker(struct worker *const worker)
>>> +{
>>> +       int wstatus, ret, i, q_len;
>>> +       struct timespec now;
>>> +
>>> +       kill(worker->pid, SIGKILL);
>>> +       ret = waitpid(worker->pid, &wstatus, 0);
>>
>> Is there a chance we could get stuck in an uninterruptible read? I think I saw
>> some in the past, but those may be blacklisted already, so this may only be
>> something to watch for if we still get test timeouts in the future.
>>
>
> I was hoping that kill is special somehow, but I suppose that I should
> check exactly what happens. If the process is stuck inside the kernel
> then we don't want to wait too long for it. We just need to know that
> the kill signal was delivered and that the process will not return to
> userland. If we have a large number of zombies then it could exhaust the
> PIDs or some other resource, but most reads are done very quickly and
> don't need interrupting.

AFAICT fatal signals are special, which means a reschedule event is sent
immediately on kill. However, I guess the IPI doesn't get delivered to
another CPU synchronously, not to mention that preemption might be
disabled, so we don't know what state everything is in by the time
kill() returns.

I also guess that some long-running kernel code will continue to run
after it has received kill and been rescheduled, so wait could block for
a longer period. In this case, though, we don't need to worry about the
process we are trying to kill returning to userland.

Regardless of whether all that is correct, we can mark the worker as
failed, call kill, then call waitpid with WNOHANG. Then either restart it
or return to the main loop. If we didn't restart it, the next time we
encounter that worker we can call waitpid again. After some number of
failures we can abandon waiting on it and create a new process.
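
A minimal sketch of that scheme, assuming a new 'killed' flag is added to
struct worker (hypothetical, not necessarily what will be merged):

	/* Hypothetical sketch: ask the stuck worker to die, but never
	 * block waiting for it. Returns 1 once the child has actually
	 * been reaped.
	 */
	static int try_reap_worker(struct worker *const worker)
	{
		int wstatus;
		pid_t ret;

		if (!worker->killed) {
			kill(worker->pid, SIGKILL);
			worker->killed = 1;
		}

		ret = waitpid(worker->pid, &wstatus, WNOHANG);

		if (ret == worker->pid)
			return 1;	/* reaped; safe to fork a replacement */

		if (ret == 0)
			return 0;	/* still stuck in the kernel; retry later */

		tst_brk(TBROK | TERRNO, "waitpid(%d, ..., WNOHANG)", worker->pid);
		return 0;
	}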

-- 
Thank you,
Richard.


* Re: [LTP] [PATCH 1/2] read_all: Add worker timeout
  2022-07-18 10:09   ` Richard Palethorpe
@ 2022-07-19  8:58     ` Li Wang
  0 siblings, 0 replies; 12+ messages in thread
From: Li Wang @ 2022-07-19  8:58 UTC (permalink / raw)
  To: Richard Palethorpe; +Cc: LTP List


On Mon, Jul 18, 2022 at 6:37 PM Richard Palethorpe <rpalethorpe@suse.de>
wrote:

> Hello Li,
>
> Li Wang <liwang@redhat.com> writes:
>
> > Hi Richard,
> >
> > On Tue, Jul 12, 2022 at 8:46 PM Richard Palethorpe <rpalethorpe@suse.com> wrote:
> >
> >  Kill and restart workers that take too long to read a file. The
> >  default being one second. A custom time can be set with the new -t
> >  option.
> >
> >  This is to prevent a worker from blocking forever in a read. Currently
> >  when this happens the whole test times out and any remaining files in
> >  the worker's queue are not tested.
> >
> >  As a side effect we can now also set the timeout very low to cause
> >  partial reads.
> >
> > Restarting workers that are blocked reading files is a practical approach.
> > My only concern is that a restarted worker may also be slow reading some
> > specific files, so it will time out again and the rest will fall into
> > restart cycles. Here I'd suggest loosening the worker_timeout to 3000 ms
> > to accommodate systems with different IO speeds.
>
> I didn't observe any issues setting the timeout very low
> (<100ms). Workers remove an item from the queue before trying to read
> it, so they shouldn't get stuck in a restart cycle on the same file (if
> that's what you meant).
>

Ah yes, exactly.



> I'm not sure if this is better or worse. In my /sys folder there are
> approx 35K files. Obviously a timeout of even 1s is far too long to read
> all of them if they all time out. In fact if we set the timeout as 100ms
> then they would still require about 58 mins.
>

At the moment we can only give a rough value for that, since nobody has
much sample data.


>
> Of course most of them take a very short time on most systems. I guess
> that ideally the timeout needs to be adapted as the test runs so that
> all remaining files can be read. The issue with that though is that
> kill+wait+fork takes more time than most reads. Although that can be
> measured as well....
>
> This issue is quite a lot like the issues we have with fuzzy sync. I
> think it's maybe time to start dynamically adapting to system
> performance. That's probably best left to another patch series though.
>

Yes, collecting sample data can help evaluate the performance.
That's a practical way to make things more reliable.

-- 
Regards,
Li Wang

